[clang] 61faf26 - [Clang][OpenMP] Update tests using update_cc_test_checks.py

Shilei Tian via cfe-commits cfe-commits at lists.llvm.org
Tue Feb 21 17:09:27 PST 2023


Author: Shilei Tian
Date: 2023-02-21T20:09:17-05:00
New Revision: 61faf261506f93819e48f050f318f96452831f92

URL: https://github.com/llvm/llvm-project/commit/61faf261506f93819e48f050f318f96452831f92
DIFF: https://github.com/llvm/llvm-project/commit/61faf261506f93819e48f050f318f96452831f92.diff

LOG: [Clang][OpenMP] Update tests using update_cc_test_checks.py

This is in preparation for other patches.
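
For reference, the assertions in these tests can be regenerated with an
invocation along the following lines. This is a sketch only: the script
path and the --llvm-bin value are assumptions for a typical build tree,
while the remaining flags are exactly the UTC_ARGS recorded in each
updated test's NOTE line.

    # Sketch: point --llvm-bin at your own build's bin/ directory.
    # The flags below mirror the UTC_ARGS embedded in the updated tests.
    llvm/utils/update_cc_test_checks.py --llvm-bin build/bin \
        --function-signature --include-generated-funcs \
        --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" \
            "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" \
        --prefix-filecheck-ir-name _ \
        clang/test/OpenMP/amdgcn_target_codegen.cpp

Running the script rewrites the test in place, replacing hand-written
CHECK lines with the autogenerated blocks seen in the diff below.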

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D144320

Added: 
    

Modified: 
    clang/test/OpenMP/amdgcn_target_codegen.cpp
    clang/test/OpenMP/nvptx_SPMD_codegen.cpp
    clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
    clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
    clang/test/OpenMP/nvptx_target_simd_codegen.cpp
    clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp

Removed: 
    


################################################################################
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
index 5027e142c4284..8fbe292c97d8e 100644
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // REQUIRES: amdgpu-registered-target
 
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
@@ -9,25 +10,16 @@
 #define N 1000
 
 int test_amdgcn_target_tid_threads() {
-// CHECK-LABEL: define weak_odr protected amdgpu_kernel void @{{.*}}test_amdgcn_target_tid_threads
-
   int arr[N];
-
-// CHECK: call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @1 to ptr), i8 1, i1 true)
 #pragma omp target
   for (int i = 0; i < N; i++) {
     arr[i] = 1;
   }
-
   return arr[0];
 }
 
 int test_amdgcn_target_tid_threads_simd() {
-// CHECK-LABEL: define weak_odr protected amdgpu_kernel void @{{.*}}test_amdgcn_target_tid_threads_simd
-
   int arr[N];
-
-// CHECK: call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @1 to ptr), i8 2, i1 false)
 #pragma omp target simd
   for (int i = 0; i < N; i++) {
     arr[i] = 1;
@@ -36,3 +28,87 @@ int test_amdgcn_target_tid_threads_simd() {
 }
 
 #endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z30test_amdgcn_target_tid_threadsv_l13
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARR_ADDR]] to ptr
+// CHECK-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:    store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    store i32 0, ptr [[I_ASCAST]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 1000
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I_ASCAST]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+// CHECK:       for.end:
+// CHECK-NEXT:    call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1)
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z35test_amdgcn_target_tid_threads_simdv_l22
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[ARR:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ARR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[ARR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARR_ADDR]] to ptr
+// CHECK-NEXT:    [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
+// CHECK-NEXT:    [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
+// CHECK-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT:    store ptr [[ARR]], ptr [[ARR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IV_ASCAST]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK:       omp.inner.for.cond:
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP11:![0-9]+]]
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 1000
+// CHECK-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK:       omp.inner.for.body:
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP11]]
+// CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP11]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP11]]
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP11]]
+// CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK:       omp.body.continue:
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK:       omp.inner.for.inc:
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP11]]
+// CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK-NEXT:    store i32 [[ADD1]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP11]]
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+// CHECK:       omp.inner.for.end:
+// CHECK-NEXT:    store i32 1000, ptr [[I_ASCAST]], align 4
+// CHECK-NEXT:    call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
+// CHECK-NEXT:    ret void
+//

diff --git a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
index bec7f3ec22599..b3edf529d7630 100644
--- a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
@@ -1,43 +1,17 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefixes=CHECK,CHECK-64 %s
 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefixes=CHECK,CHECK-32 %s
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefixes=CHECK,CHECK-32,CHECK-32-EX %s
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
 
 int a;
 
-// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
-// CHECK-DAG: [[DISTR_FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 {{[0-9]+}}, i8* getelementptr inbounds
-// CHECK-DAG: [[FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 {{[0-9]+}}, i8* getelementptr inbounds
-// CHECK-DAG: [[BAR_FULL:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 {{[0-9]+}}, i8* getelementptr inbounds
-// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
-
 void foo() {
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
 #pragma omp target teams distribute parallel for simd if(a)
   for (int i = 0; i < 10; ++i)
     ;
@@ -60,27 +34,6 @@ void foo() {
   for (int i = 0; i < 10; ++i)
     ;
 int a;
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
 #pragma omp target teams distribute parallel for lastprivate(a)
   for (int i = 0; i < 10; ++i)
     a = i;
@@ -102,27 +55,6 @@ int a;
 #pragma omp target teams distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init(
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
 #pragma omp target teams
    {
      int b;
@@ -162,27 +94,6 @@ int a;
 #pragma omp distribute parallel for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
 #pragma omp target teams
 #pragma omp distribute parallel for
   for (int i = 0; i < 10; ++i)
@@ -211,26 +122,6 @@ int a;
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[DISTR_FULL]]
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
 #pragma omp target
 #pragma omp teams
 #pragma omp distribute parallel for
@@ -266,20 +157,6 @@ int a;
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
 #pragma omp target parallel for if(a)
   for (int i = 0; i < 10; ++i)
     ;
@@ -301,27 +178,6 @@ int a;
 #pragma omp target parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
 #pragma omp target parallel if(a)
 #pragma omp for simd
   for (int i = 0; i < 10; ++i)
@@ -350,26 +206,6 @@ int a;
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
 #pragma omp target
 #pragma omp parallel
 #pragma omp for simd ordered
@@ -405,18 +241,6 @@ int a;
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-DAG: [[FULL]]
 #pragma omp target
 #pragma omp parallel for
   for (int i = 0; i < 10; ++i)
@@ -448,4 +272,27472 @@ int a;
 }
 
 #endif
-
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l17
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l20
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__5(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l23
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__7(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l29
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l32
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__13(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l39
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__17(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l42
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__19(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l45
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__21(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l48
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__23(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l51
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__25(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l54
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__27(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l57
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__29(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l65
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__31(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l72
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__33(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l80
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__35(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l84
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__37(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l88
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__39(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l92
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__41(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l96
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__43(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l100
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__45(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l104
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__47(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l108
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__49(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l112
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__51(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l116
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__53(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l120
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__55(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l124
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__57(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l129
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__59(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l134
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__61(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l139
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__63(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l144
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__65(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l149
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__67(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l154
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT:    call void @__omp_outlined__69(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l14
+// CHECK-64-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-64-NEXT:    [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__CASTED]] to i8*
+// CHECK-64-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
+// CHECK-64-NEXT:    store i8 [[FROMBOOL]], i8* [[CONV1]], align 1
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]]) #[[ATTR2:[0-9]+]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__CASTED16:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS19:%.*]] = alloca [3 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP5]] to i1
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-64:       omp_if.then:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i8, i8* [[CONV]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TOBOOL2:%.*]] = trunc i8 [[TMP11]] to i1
+// CHECK-64-NEXT:    [[CONV3:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__CASTED]] to i8*
+// CHECK-64-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL2]] to i8
+// CHECK-64-NEXT:    store i8 [[FROMBOOL]], i8* [[CONV3]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP14]], i8** [[TMP13]], align 8, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP16]], i8** [[TMP15]], align 8, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK-64-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP18]], i8** [[TMP17]], align 8, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i8, i8* [[CONV]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TOBOOL4:%.*]] = trunc i8 [[TMP19]] to i1
+// CHECK-64-NEXT:    [[TMP20:%.*]] = zext i1 [[TOBOOL4]] to i32
+// CHECK-64-NEXT:    [[TMP21:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP20]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP21]], i64 3), !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK-64-NEXT:    store i32 [[ADD6]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[TMP28]], 9
+// CHECK-64-NEXT:    br i1 [[CMP7]], label [[COND_TRUE8:%.*]], label [[COND_FALSE9:%.*]]
+// CHECK-64:       cond.true8:
+// CHECK-64-NEXT:    br label [[COND_END10:%.*]]
+// CHECK-64:       cond.false9:
+// CHECK-64-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    br label [[COND_END10]]
+// CHECK-64:       cond.end10:
+// CHECK-64-NEXT:    [[COND11:%.*]] = phi i32 [ 9, [[COND_TRUE8]] ], [ [[TMP29]], [[COND_FALSE9]] ]
+// CHECK-64-NEXT:    store i32 [[COND11]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    store i32 [[TMP30]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP131:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-64:       omp_if.else:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND12:%.*]]
+// CHECK-64:       omp.inner.for.cond12:
+// CHECK-64-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP13:%.*]] = icmp slt i32 [[TMP31]], 10
+// CHECK-64-NEXT:    br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY14:%.*]], label [[OMP_INNER_FOR_END30:%.*]]
+// CHECK-64:       omp.inner.for.body14:
+// CHECK-64-NEXT:    [[TMP32:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP33:%.*]] = zext i32 [[TMP32]] to i64
+// CHECK-64-NEXT:    [[TMP34:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP35:%.*]] = zext i32 [[TMP34]] to i64
+// CHECK-64-NEXT:    [[TMP36:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-64-NEXT:    [[TOBOOL15:%.*]] = trunc i8 [[TMP36]] to i1
+// CHECK-64-NEXT:    [[CONV17:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__CASTED16]] to i8*
+// CHECK-64-NEXT:    [[FROMBOOL18:%.*]] = zext i1 [[TOBOOL15]] to i8
+// CHECK-64-NEXT:    store i8 [[FROMBOOL18]], i8* [[CONV17]], align 1
+// CHECK-64-NEXT:    [[TMP37:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED16]], align 8
+// CHECK-64-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP39:%.*]] = inttoptr i64 [[TMP33]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP39]], i8** [[TMP38]], align 8
+// CHECK-64-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP41:%.*]] = inttoptr i64 [[TMP35]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP41]], i8** [[TMP40]], align 8
+// CHECK-64-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i64 0, i64 2
+// CHECK-64-NEXT:    [[TMP43:%.*]] = inttoptr i64 [[TMP37]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP43]], i8** [[TMP42]], align 8
+// CHECK-64-NEXT:    [[TMP44:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-64-NEXT:    [[TOBOOL20:%.*]] = trunc i8 [[TMP44]] to i1
+// CHECK-64-NEXT:    [[TMP45:%.*]] = zext i1 [[TOBOOL20]] to i32
+// CHECK-64-NEXT:    [[TMP46:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS19]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP45]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP46]], i64 3)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC21:%.*]]
+// CHECK-64:       omp.inner.for.inc21:
+// CHECK-64-NEXT:    [[TMP47:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD22:%.*]] = add nsw i32 [[TMP47]], [[TMP48]]
+// CHECK-64-NEXT:    store i32 [[ADD22]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP50:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP49]], [[TMP50]]
+// CHECK-64-NEXT:    store i32 [[ADD23]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP52:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP51]], [[TMP52]]
+// CHECK-64-NEXT:    store i32 [[ADD24]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP53:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[TMP53]], 9
+// CHECK-64-NEXT:    br i1 [[CMP25]], label [[COND_TRUE26:%.*]], label [[COND_FALSE27:%.*]]
+// CHECK-64:       cond.true26:
+// CHECK-64-NEXT:    br label [[COND_END28:%.*]]
+// CHECK-64:       cond.false27:
+// CHECK-64-NEXT:    [[TMP54:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END28]]
+// CHECK-64:       cond.end28:
+// CHECK-64-NEXT:    [[COND29:%.*]] = phi i32 [ 9, [[COND_TRUE26]] ], [ [[TMP54]], [[COND_FALSE27]] ]
+// CHECK-64-NEXT:    store i32 [[COND29]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP55:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP55]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND12]], !llvm.loop [[LOOP134:![0-9]+]]
+// CHECK-64:       omp.inner.for.end30:
+// CHECK-64-NEXT:    br label [[OMP_IF_END]]
+// CHECK-64:       omp_if.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP56:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP57:%.*]] = icmp ne i32 [[TMP56]], 0
+// CHECK-64-NEXT:    br i1 [[TMP57]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV2]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-64:       omp_if.then:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136:![0-9]+]]
+// CHECK-64-NEXT:    [[CONV3:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8, !llvm.access.group [[ACC_GRP136]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV3]], [[TMP7]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP137:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-64:       omp_if.else:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND5:%.*]]
+// CHECK-64:       omp.inner.for.cond5:
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CONV6:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP15]]
+// CHECK-64-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY8:%.*]], label [[OMP_INNER_FOR_END14:%.*]]
+// CHECK-64:       omp.inner.for.body8:
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-64-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK-64-NEXT:    store i32 [[ADD10]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE11:%.*]]
+// CHECK-64:       omp.body.continue11:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC12:%.*]]
+// CHECK-64:       omp.inner.for.inc12:
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND5]], !llvm.loop [[LOOP139:![0-9]+]]
+// CHECK-64:       omp.inner.for.end14:
+// CHECK-64-NEXT:    br label [[OMP_IF_END]]
+// CHECK-64:       omp_if.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP20]])
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK-64-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV2]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-64:       omp_if.then:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140:![0-9]+]]
+// CHECK-64-NEXT:    [[CONV3:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8, !llvm.access.group [[ACC_GRP140]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV3]], [[TMP7]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP141:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-64:       omp_if.else:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND5:%.*]]
+// CHECK-64:       omp.inner.for.cond5:
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CONV6:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CMP7:%.*]] = icmp ule i64 [[CONV6]], [[TMP15]]
+// CHECK-64-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY8:%.*]], label [[OMP_INNER_FOR_END14:%.*]]
+// CHECK-64:       omp.inner.for.body8:
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-64-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK-64-NEXT:    store i32 [[ADD10]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE11:%.*]]
+// CHECK-64:       omp.body.continue11:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC12:%.*]]
+// CHECK-64:       omp.inner.for.inc12:
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND5]], !llvm.loop [[LOOP143:![0-9]+]]
+// CHECK-64:       omp.inner.for.end14:
+// CHECK-64-NEXT:    br label [[OMP_IF_END]]
+// CHECK-64:       omp_if.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP20]])
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK-64-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l17
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP145:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP148:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-64-NEXT:    br i1 [[TMP12]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l20
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__5(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__6 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP151:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153:![0-9]+]]
+// CHECK-64-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8, !llvm.access.group [[ACC_GRP153]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP154:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-64-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l23
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__7(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__8 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP157:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP160:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP163:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP166:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l29
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP169:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__12
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP172:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l32
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__13(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__13
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__14 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP175:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__14
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP178:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l36
+// CHECK-64-SAME: (i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT:    [[CONV1:%.*]] = bitcast i64* [[A_CASTED]] to i32*
+// CHECK-64-NEXT:    store i32 [[TMP2]], i32* [[CONV1]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[A_CASTED]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__15(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__15
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK-64-NEXT:    [[A1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-64-NEXT:    [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
+// CHECK-64-NEXT:    [[CONV3:%.*]] = bitcast i64* [[A_CASTED]] to i32*
+// CHECK-64-NEXT:    store i32 [[TMP10]], i32* [[CONV3]], align 4
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i64, i64* [[A_CASTED]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP15]], i8** [[TMP14]], align 8
+// CHECK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK-64-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP17]], i8** [[TMP16]], align 8
+// CHECK-64-NEXT:    [[TMP18:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64, i64)* @__omp_outlined__16 to i8*), i8* null, i8** [[TMP18]], i64 3)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[TMP25]], 9
+// CHECK-64-NEXT:    br i1 [[CMP6]], label [[COND_TRUE7:%.*]], label [[COND_FALSE8:%.*]]
+// CHECK-64:       cond.true7:
+// CHECK-64-NEXT:    br label [[COND_END9:%.*]]
+// CHECK-64:       cond.false8:
+// CHECK-64-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END9]]
+// CHECK-64:       cond.end9:
+// CHECK-64-NEXT:    [[COND10:%.*]] = phi i32 [ 9, [[COND_TRUE7]] ], [ [[TMP26]], [[COND_FALSE8]] ]
+// CHECK-64-NEXT:    store i32 [[COND10]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP27]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK-64-NEXT:    br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-64:       .omp.lastprivate.then:
+// CHECK-64-NEXT:    [[TMP30:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP30]], i32* [[CONV]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-64:       .omp.lastprivate.done:
+// CHECK-64-NEXT:    call void @__kmpc_free_shared(i8* [[A1]], i64 4)
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__16
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[A3:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV2]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CONV4:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV4]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP8]], i32* [[A3]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-64-NEXT:    br i1 [[TMP12]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-64:       .omp.lastprivate.then:
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[A3]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP13]], i32* [[CONV]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-64:       .omp.lastprivate.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l39
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__17(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__17
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__18 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__18
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l42
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__19(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__19
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__20 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__20
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l45
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__21(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__21
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__22 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__22
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP181:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l48
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__23(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__23
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__24 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__24
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP184:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l51
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__25(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__25
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__26 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__26
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP187:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l54
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__27(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__27
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__28 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__28
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP190:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l57
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__29(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__29
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[B:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__30 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP193:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__30
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195:![0-9]+]]
+// CHECK-64-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8, !llvm.access.group [[ACC_GRP195]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP196:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-64-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l65
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__31(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__31
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[B:%.*]] = alloca [3 x i32], align 4
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = bitcast [3 x i32]* [[B]] to i8*
+// CHECK-64-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP0]], i8* align 4 bitcast ([3 x i32]* @"__const.<captured>.b" to i8*), i64 12, i1 false)
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 8, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP14]], i8** [[TMP13]], align 8, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__32 to i8*), i8* null, i8** [[TMP15]], i64 2), !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    store i32 [[TMP24]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP199:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0
+// CHECK-64-NEXT:    br i1 [[TMP26]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__32
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP202:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-64-NEXT:    br i1 [[TMP12]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l72
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__33(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__33
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[C:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
+// CHECK-64-NEXT:    [[C_ON_STACK:%.*]] = bitcast i8* [[C]] to i32**
+// CHECK-64-NEXT:    [[B:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-64-NEXT:    [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 8, !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP14]], i8** [[TMP13]], align 8, !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__34 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__34_wrapper to i8*), i8** [[TMP15]], i64 2), !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP205:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-64-NEXT:    br i1 [[TMP19]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    store i32* [[B_ON_STACK]], i32** [[C_ON_STACK]], align 8
+// CHECK-64-NEXT:    call void @__kmpc_free_shared(i8* [[B]], i64 4)
+// CHECK-64-NEXT:    call void @__kmpc_free_shared(i8* [[C]], i64 8)
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__34
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207:![0-9]+]]
+// CHECK-64-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8, !llvm.access.group [[ACC_GRP207]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP208:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-64-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__34_wrapper
+// CHECK-64-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+// CHECK-64-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
+// CHECK-64-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i64*
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
+// CHECK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
+// CHECK-64-NEXT:    [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i64*
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8
+// CHECK-64-NEXT:    call void @__omp_outlined__34(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i64 [[TMP5]], i64 [[TMP8]]) #[[ATTR2]]
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l80
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__35(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__35
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__36 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP211:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__36
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP214:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l84
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__37(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__37
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__38 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP217:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__38
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP220:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l88
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__39(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__39
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__40 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP223:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__40
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP226:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l92
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__41(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__41
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__42 to i8*), i8* null, i8** [[TMP14]], i64 2), !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP229:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__42
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP232:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l96
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__43(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__43
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__44 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__44
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l100
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__45(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__45
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__46 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__46
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l104
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__47(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__47
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__48 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__48
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l108
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__49(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__49
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__50 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__50
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP235:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l112
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__51(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__51
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__52 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__52
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP238:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l116
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__53(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__53
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__54 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__54
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP241:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l120
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__55(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__55
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__56 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__56
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP244:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l124
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__57(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__57
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__58 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__58
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l129
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__59(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__59
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__60 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__60
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l134
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__61(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__61
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__62 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__62
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l139
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__63(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__63
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__64 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__64
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP247:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l144
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__65(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__65
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__66 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__66
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP250:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l149
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__67(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__67
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__68 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__68
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP253:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l154
+// CHECK-64-SAME: () #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__69(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__69
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP7]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP9]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i64, i64)* @__omp_outlined__70 to i8*), i8* null, i8** [[TMP14]], i64 2)
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP21]], 9
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-64:       cond.true5:
+// CHECK-64-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-64:       cond.false6:
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END7]]
+// CHECK-64:       cond.end7:
+// CHECK-64-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP22]], [[COND_FALSE6]] ]
+// CHECK-64-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__70
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT:    store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP256:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l159
+// CHECK-64-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR8:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-64-NEXT:    [[TMP3:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK-64-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP3]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__71 to i8*), i8* null, i8** [[TMP4]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__71
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l162
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__72 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__72
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l165
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__73 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__73
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l168
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__74 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__74
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP259:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l171
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__75 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__75
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP262:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l174
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__76 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__76
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP265:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l177
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__77 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__77
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP268:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l180
+// CHECK-64-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    store i64 [[DOTCAPTURE_EXPR_]], i64* [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK-64-NEXT:    [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-64-NEXT:    [[TMP3:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK-64-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP3]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__78 to i8*), i8* null, i8** [[TMP4]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__78
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP271:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-64-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l184
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__79 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__79
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP274:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+// CHECK-64-NEXT:    br i1 [[TMP10]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l188
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__80 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__80
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP277:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-64-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l192
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__81 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__81
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP280:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l196
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__82 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__82
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP283:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l200
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__83 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__83
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP286:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l204
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__84 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__84
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP289:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l208
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__85 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__85
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 65, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP291]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP292:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l213
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__86 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__86
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP295:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+// CHECK-64-NEXT:    br i1 [[TMP10]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l218
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__87 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__87
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP298:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-64-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l223
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__88 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__88
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP301:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
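For reference, the large immediates passed to __kmpc_dispatch_init_4 in these checks are OpenMP runtime schedule kinds with the nonmonotonic modifier bit (1 << 30) ORed in, and the surrounding omp.dispatch.cond loop keeps calling __kmpc_dispatch_next_4 until it returns 0, i.e. until no chunks remain. A minimal sketch of source that lowers to these constants, assuming the usual kmp sched_type encoding; the directive spellings below are illustrative, not copied from the test:

  // Hypothetical reduction of the test's foo() pattern (trip count and
  // clauses are assumptions). Build with: clang++ -fopenmp foo_sketch.cpp
  void foo_sketch() {
    // 1073741859 == (1 << 30) | 35: nonmonotonic dynamic, chunked
    #pragma omp target parallel for simd schedule(dynamic)
    for (int i = 0; i < 10; ++i) { }
    // 1073741860 == (1 << 30) | 36: nonmonotonic guided, chunked
    #pragma omp target parallel for simd schedule(guided)
    for (int i = 0; i < 10; ++i) { }
    // 1073741861 == (1 << 30) | 37: nonmonotonic runtime
    #pragma omp target parallel for simd schedule(runtime)
    for (int i = 0; i < 10; ++i) { }
    // 1073741862 == (1 << 30) | 38: nonmonotonic auto
    #pragma omp target parallel for simd schedule(auto)
    for (int i = 0; i < 10; ++i) { }
  }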
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l228
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__89 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__89
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP304:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l233
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__90 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__90
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP307:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l238
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__91 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__91
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP310:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l243
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__92 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__92
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l247
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__93 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__93
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64:       omp.loop.exit:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
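The static-schedule variants differ only in the kind passed to __kmpc_for_static_init_4: 34 is plain static (each thread gets one contiguous chunk, so the loop body runs once with no outer dispatch loop, as in __omp_outlined__93 above), while 33 is static with an explicit chunk size, which is why __omp_outlined__92 above and __omp_outlined__94 below keep the omp.dispatch.cond/omp.dispatch.inc strip-mining loop that advances .omp.lb and .omp.ub by the stride. A sketch assuming the usual encoding; directive spellings are illustrative, not the test's exact source:

  // kind 34 (kmp_sch_static): no chunk clause, one chunk per thread
  #pragma omp target parallel for schedule(static)
  for (int i = 0; i < 10; ++i) { }
  // kind 33 (kmp_sch_static_chunked): explicit chunk, outer dispatch loop
  #pragma omp target parallel for schedule(static, 1)
  for (int i = 0; i < 10; ++i) { }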
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l251
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__94 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__94
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l255
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__95 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__95
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP313:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l259
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__96 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__96
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP316:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l263
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__97 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__97
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP319:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l267
+// CHECK-64-SAME: () #[[ATTR8]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__98 to i8*), i8* null, i8** [[TMP2]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__98
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-64-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-64-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP322:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l14
+// CHECK-32-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-NEXT:    [[CONV1:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__CASTED]] to i8*
+// CHECK-32-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
+// CHECK-32-NEXT:    store i8 [[FROMBOOL]], i8* [[CONV1]], align 1
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR2:[0-9]+]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__CASTED16:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS19:%.*]] = alloca [3 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP5]] to i1
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-32:       omp_if.then:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i8, i8* [[CONV]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TOBOOL2:%.*]] = trunc i8 [[TMP9]] to i1
+// CHECK-32-NEXT:    [[CONV3:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__CASTED]] to i8*
+// CHECK-32-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL2]] to i8
+// CHECK-32-NEXT:    store i8 [[FROMBOOL]], i8* [[CONV3]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP14]], i8** [[TMP13]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK-32-NEXT:    [[TMP16:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP16]], i8** [[TMP15]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i8, i8* [[CONV]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TOBOOL4:%.*]] = trunc i8 [[TMP17]] to i1
+// CHECK-32-NEXT:    [[TMP18:%.*]] = zext i1 [[TOBOOL4]] to i32
+// CHECK-32-NEXT:    [[TMP19:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP18]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP19]], i32 3), !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-32-NEXT:    store i32 [[ADD6]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[TMP26]], 9
+// CHECK-32-NEXT:    br i1 [[CMP7]], label [[COND_TRUE8:%.*]], label [[COND_FALSE9:%.*]]
+// CHECK-32:       cond.true8:
+// CHECK-32-NEXT:    br label [[COND_END10:%.*]]
+// CHECK-32:       cond.false9:
+// CHECK-32-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    br label [[COND_END10]]
+// CHECK-32:       cond.end10:
+// CHECK-32-NEXT:    [[COND11:%.*]] = phi i32 [ 9, [[COND_TRUE8]] ], [ [[TMP27]], [[COND_FALSE9]] ]
+// CHECK-32-NEXT:    store i32 [[COND11]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP131:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-32:       omp_if.else:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND12:%.*]]
+// CHECK-32:       omp.inner.for.cond12:
+// CHECK-32-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP13:%.*]] = icmp slt i32 [[TMP29]], 10
+// CHECK-32-NEXT:    br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY14:%.*]], label [[OMP_INNER_FOR_END30:%.*]]
+// CHECK-32:       omp.inner.for.body14:
+// CHECK-32-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP32:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-NEXT:    [[TOBOOL15:%.*]] = trunc i8 [[TMP32]] to i1
+// CHECK-32-NEXT:    [[CONV17:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__CASTED16]] to i8*
+// CHECK-32-NEXT:    [[FROMBOOL18:%.*]] = zext i1 [[TOBOOL15]] to i8
+// CHECK-32-NEXT:    store i8 [[FROMBOOL18]], i8* [[CONV17]], align 1
+// CHECK-32-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED16]], align 4
+// CHECK-32-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP35:%.*]] = inttoptr i32 [[TMP30]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP35]], i8** [[TMP34]], align 4
+// CHECK-32-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP37:%.*]] = inttoptr i32 [[TMP31]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP37]], i8** [[TMP36]], align 4
+// CHECK-32-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i32 0, i32 2
+// CHECK-32-NEXT:    [[TMP39:%.*]] = inttoptr i32 [[TMP33]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP39]], i8** [[TMP38]], align 4
+// CHECK-32-NEXT:    [[TMP40:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-NEXT:    [[TOBOOL20:%.*]] = trunc i8 [[TMP40]] to i1
+// CHECK-32-NEXT:    [[TMP41:%.*]] = zext i1 [[TOBOOL20]] to i32
+// CHECK-32-NEXT:    [[TMP42:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS19]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP42]], i32 3)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC21:%.*]]
+// CHECK-32:       omp.inner.for.inc21:
+// CHECK-32-NEXT:    [[TMP43:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP44:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD22:%.*]] = add nsw i32 [[TMP43]], [[TMP44]]
+// CHECK-32-NEXT:    store i32 [[ADD22]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP45]], [[TMP46]]
+// CHECK-32-NEXT:    store i32 [[ADD23]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP47]], [[TMP48]]
+// CHECK-32-NEXT:    store i32 [[ADD24]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[TMP49]], 9
+// CHECK-32-NEXT:    br i1 [[CMP25]], label [[COND_TRUE26:%.*]], label [[COND_FALSE27:%.*]]
+// CHECK-32:       cond.true26:
+// CHECK-32-NEXT:    br label [[COND_END28:%.*]]
+// CHECK-32:       cond.false27:
+// CHECK-32-NEXT:    [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END28]]
+// CHECK-32:       cond.end28:
+// CHECK-32-NEXT:    [[COND29:%.*]] = phi i32 [ 9, [[COND_TRUE26]] ], [ [[TMP50]], [[COND_FALSE27]] ]
+// CHECK-32-NEXT:    store i32 [[COND29]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP51]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND12]], !llvm.loop [[LOOP134:![0-9]+]]
+// CHECK-32:       omp.inner.for.end30:
+// CHECK-32-NEXT:    br label [[OMP_IF_END]]
+// CHECK-32:       omp_if.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0
+// CHECK-32-NEXT:    br i1 [[TMP53]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-32:       omp_if.then:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP137:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-32:       omp_if.else:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND2:%.*]]
+// CHECK-32:       omp.inner.for.cond2:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    [[CMP3:%.*]] = icmp ule i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY4:%.*]], label [[OMP_INNER_FOR_END10:%.*]]
+// CHECK-32:       omp.inner.for.body4:
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-NEXT:    [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK-32-NEXT:    store i32 [[ADD6]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE7:%.*]]
+// CHECK-32:       omp.body.continue7:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC8:%.*]]
+// CHECK-32:       omp.inner.for.inc8:
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND2]], !llvm.loop [[LOOP139:![0-9]+]]
+// CHECK-32:       omp.inner.for.end10:
+// CHECK-32-NEXT:    br label [[OMP_IF_END]]
+// CHECK-32:       omp_if.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP20]])
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK-32-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-32:       omp_if.then:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP141:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-32:       omp_if.else:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND2:%.*]]
+// CHECK-32:       omp.inner.for.cond2:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    [[CMP3:%.*]] = icmp ule i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY4:%.*]], label [[OMP_INNER_FOR_END10:%.*]]
+// CHECK-32:       omp.inner.for.body4:
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-NEXT:    [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK-32-NEXT:    store i32 [[ADD6]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE7:%.*]]
+// CHECK-32:       omp.body.continue7:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC8:%.*]]
+// CHECK-32:       omp.inner.for.inc8:
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND2]], !llvm.loop [[LOOP143:![0-9]+]]
+// CHECK-32:       omp.inner.for.end10:
+// CHECK-32-NEXT:    br label [[OMP_IF_END]]
+// CHECK-32:       omp_if.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP20]])
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK-32-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l17
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP145:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP148:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-32-NEXT:    br i1 [[TMP12]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l20
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__5(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__6 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP151:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP154:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-32-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
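+// Note (not autogenerated; constant names assumed from the runtime's kmp.h
+// sched_type enum): the `i32 33` argument to __kmpc_for_static_init_4 above
+// appears to be kmp_sch_static_chunked, and with the trailing incr and chunk
+// arguments both 1 it is consistent with a `schedule(static, 1)` inner loop
+// running over the team-assigned [PREVIOUS_LB, PREVIOUS_UB] range.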
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l23
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__7(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
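+// Note (not autogenerated; enum names assumed from the OpenMP device
+// runtime): the `i8 2` mode passed to __kmpc_target_init/_deinit above
+// appears to be OMP_TGT_EXEC_MODE_SPMD, and `i1 false` skips the
+// generic-mode worker state machine, so all threads enter user code
+// directly.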
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__8 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP157:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
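+// Note (not autogenerated; schedule name assumed from kmp.h): `i32 91` in
+// the __kmpc_distribute_static_init_4 call above appears to be
+// kmp_distribute_static_chunked, chunked by the hardware thread count; the
+// resulting COMB_LB/COMB_UB team bounds are then handed to __omp_outlined__8
+// through inttoptr'd slots in CAPTURED_VARS_ADDRS via __kmpc_parallel_51.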
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP160:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
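+// Note (not autogenerated; constant decoded against kmp.h): `i32 1073741862`
+// in the __kmpc_dispatch_init_4 call above is 0x40000000 | 38, which appears
+// to be the nonmonotonic modifier on kmp_sch_auto, i.e. the dynamic-dispatch
+// lowering of a `schedule(auto)` clause.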
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP163:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP166:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
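+// Note (not autogenerated; constant decoded against kmp.h): `i32 1073741861`
+// in the __kmpc_dispatch_init_4 call above is 0x40000000 | 37, which appears
+// to be nonmonotonic kmp_sch_runtime, i.e. the lowering of a
+// `schedule(runtime)` clause.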
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l29
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP169:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__12
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP172:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
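+// Note (not autogenerated; constant decoded against kmp.h): `i32 1073741859`
+// in the __kmpc_dispatch_init_4 call above is 0x40000000 | 35, which appears
+// to be nonmonotonic kmp_sch_dynamic_chunked, i.e. the lowering of a
+// `schedule(dynamic)` clause with the default chunk of 1.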
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l32
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__13(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__13
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__14 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP175:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__14
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP178:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
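+// Note (not autogenerated; constant decoded against kmp.h): `i32 1073741860`
+// in the __kmpc_dispatch_init_4 call above is 0x40000000 | 36, which appears
+// to be nonmonotonic kmp_sch_guided_chunked, i.e. the lowering of a
+// `schedule(guided)` clause.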
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l36
+// CHECK-32-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP2]], i32* [[A_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A_CASTED]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__15(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__15
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[A1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
+// CHECK-32-NEXT:    [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP8]], i32* [[A_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP13:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK-32-NEXT:    [[TMP15:%.*]] = inttoptr i32 [[TMP9]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP15]], i8** [[TMP14]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32)* @__omp_outlined__16 to i8*), i8* null, i8** [[TMP16]], i32 3)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP23]], 9
+// CHECK-32-NEXT:    br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK-32:       cond.true6:
+// CHECK-32-NEXT:    br label [[COND_END8:%.*]]
+// CHECK-32:       cond.false7:
+// CHECK-32-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END8]]
+// CHECK-32:       cond.end8:
+// CHECK-32-NEXT:    [[COND9:%.*]] = phi i32 [ 9, [[COND_TRUE6]] ], [ [[TMP24]], [[COND_FALSE7]] ]
+// CHECK-32-NEXT:    store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP25]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK-32-NEXT:    br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-32:       .omp.lastprivate.then:
+// CHECK-32-NEXT:    [[TMP28:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP28]], i32* [[A_ADDR]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-32:       .omp.lastprivate.done:
+// CHECK-32-NEXT:    call void @__kmpc_free_shared(i8* [[A1]], i32 4)
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__16
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[A1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP8]], i32* [[A1]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-32-NEXT:    br i1 [[TMP12]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-32:       .omp.lastprivate.then:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP13]], i32* [[A_ADDR]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-32:       .omp.lastprivate.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l39
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__17(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__17
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__18 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__18
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l42
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__19(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__19
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__20 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__20
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l45
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__21(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__21
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__22 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__22
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP181:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l48
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__23(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__23
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__24 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__24
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP184:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
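For orientation while reading these dispatch checks: the schedule value 1073741861 expected in @__omp_outlined__24's call to __kmpc_dispatch_init_4 decodes to kmp_sch_runtime (37) with the non-monotonic modifier bit (1 << 30) set, i.e. a schedule(runtime) worksharing loop driven through the __kmpc_dispatch_init_4/__kmpc_dispatch_next_4 pair. A minimal sketch of a construct with that shape (a hypothetical reduction, not the literal source of nvptx_SPMD_codegen.cpp):

    void foo() {
      // schedule(runtime) lowers to __kmpc_dispatch_init_4 with
      // 0x40000025 = nonmonotonic (1 << 30) | kmp_sch_runtime (37).
    #pragma omp target teams distribute parallel for simd schedule(runtime)
      for (int i = 0; i < 10; ++i)
        ;
    }

The trip count of 10 matches the constant upper bound 9 stored into the .omp.ub/.omp.comb.ub slots throughout these checks.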
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l51
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__25(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
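The kernel entry checked above also encodes the execution mode: __kmpc_target_init(..., i8 2, i1 false) requests SPMD mode with the generic-mode state machine disabled, so control reaches user_code.entry directly instead of parking worker threads first. A conceptual sketch of the guard these lines assert (pseudocode mirroring the checks, not an addition to the test):

    // Sketch of the generated prologue shape in SPMD mode.
    int rc = __kmpc_target_init(ident, /*Mode=*/2,
                                /*UseGenericStateMachine=*/false);
    if (rc == -1) {
      // user_code.entry: the outlined target region runs here.
    }
    // worker.exit: non-participating threads return.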
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__25
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__26 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__26
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP187:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
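The same decoding applies to @__omp_outlined__26 above: 1073741859 is kmp_sch_dynamic_chunked (35) plus the non-monotonic bit, so these checks track a schedule(dynamic) loop. A hypothetical reduced example of that shape:

    void foo() {
      // schedule(dynamic) -> __kmpc_dispatch_init_4 with
      // 0x40000023 = nonmonotonic (1 << 30) | kmp_sch_dynamic_chunked (35).
    #pragma omp target teams distribute parallel for simd schedule(dynamic)
      for (int i = 0; i < 10; ++i)
        ;
    }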
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l54
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__27(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__27
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__28 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__28
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP190:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
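And in @__omp_outlined__28 the value 1073741860 is kmp_sch_guided_chunked (36) with the non-monotonic bit, i.e. schedule(guided). Hypothetical reduction:

    void foo() {
      // schedule(guided) -> __kmpc_dispatch_init_4 with
      // 0x40000024 = nonmonotonic (1 << 30) | kmp_sch_guided_chunked (36).
    #pragma omp target teams distribute parallel for simd schedule(guided)
      for (int i = 0; i < 10; ++i)
        ;
    }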
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l57
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__29(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__29
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[B:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__30 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP193:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__30
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP196:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-32-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
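@__omp_outlined__30 above switches from the dispatch calls to __kmpc_for_static_init_4 with schedule 33 (kmp_sch_static_chunked) and a chunk of 1, i.e. schedule(static, 1); the !llvm.access.group/!llvm.loop metadata marks the simd-vectorizable body, and the .omp.final.then block stores the loop variable's final value of 10. A sketch of a construct with that shape (hypothetical, as above):

    void foo() {
      // schedule(static, 1) -> __kmpc_for_static_init_4 with
      // kmp_sch_static_chunked (33) and chunk size 1.
    #pragma omp target teams distribute parallel for simd schedule(static, 1)
      for (int i = 0; i < 10; ++i)
        ;
    }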
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l65
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__31(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__31
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[B:%.*]] = alloca [3 x i32], align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = bitcast [3 x i32]* [[B]] to i8*
+// CHECK-32-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP0]], i8* align 4 bitcast ([3 x i32]* @"__const.<captured>.b" to i8*), i32 12, i1 false)
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__32 to i8*), i8* null, i8** [[TMP13]], i32 2), !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP20]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP21]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    store i32 [[TMP22]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP199:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+// CHECK-32-NEXT:    br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__32
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP202:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-32-NEXT:    br i1 [[TMP12]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
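@__omp_outlined__32 uses schedule 34 (kmp_sch_static, the unchunked variant), matching a plain schedule(static) clause, again followed by the final store of i = 10. Hypothetical reduction:

    void foo() {
      // schedule(static) -> __kmpc_for_static_init_4 with
      // kmp_sch_static (34); the runtime computes block scheduling.
    #pragma omp target teams distribute parallel for simd schedule(static)
      for (int i = 0; i < 10; ++i)
        ;
    }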
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l72
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__33(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
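The l72 kernel above is the one region in this batch initialized with i8 1, i1 true, i.e. generic (non-SPMD) mode with the state machine enabled; accordingly @__omp_outlined__33 below obtains storage for the locals c and b through __kmpc_alloc_shared rather than alloca, since values captured by a nested parallel region must live in team-visible memory in generic mode. A conceptual sketch of the globalization the checks expect (the matching __kmpc_free_shared calls fall outside this excerpt):

    // Team-visible slots replace stack slots in generic mode.
    void *c = __kmpc_alloc_shared(4); // holds an i32* (32-bit target)
    void *b = __kmpc_alloc_shared(4); // holds an i32
    // ... the distribute loop launches __kmpc_parallel_51, whose
    // outlined body accesses b and c through these shared slots ...
    __kmpc_free_shared(b, 4);
    __kmpc_free_shared(c, 4);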
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__33
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[C:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
+// CHECK-32-NEXT:    [[C_ON_STACK:%.*]] = bitcast i8* [[C]] to i32**
+// CHECK-32-NEXT:    [[B:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
+// CHECK-32-NEXT:    [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__34 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__34_wrapper to i8*), i8** [[TMP13]], i32 2), !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP205:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = icmp ne i32 [[TMP16]], 0
+// CHECK-32-NEXT:    br i1 [[TMP17]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    store i32* [[B_ON_STACK]], i32** [[C_ON_STACK]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_free_shared(i8* [[B]], i32 4)
+// CHECK-32-NEXT:    call void @__kmpc_free_shared(i8* [[C]], i32 4)
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__34
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP208:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-32-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__34_wrapper
+// CHECK-32-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK-32-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
+// CHECK-32-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32*
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 1
+// CHECK-32-NEXT:    [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32*
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__34(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32 [[TMP5]], i32 [[TMP8]]) #[[ATTR2]]
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l80
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__35(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__35
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__36 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP211:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__36
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP214:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l84
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__37(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__37
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__38 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP217:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__38
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP220:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l88
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__39(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__39
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__40 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP223:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__40
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP226:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l92
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__41(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__41
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__42 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP229:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__42
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP232:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l96
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__43(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__43
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__44 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__44
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l100
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__45(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__45
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__46 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__46
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l104
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__47(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__47
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__48 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__48
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l108
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__49(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__49
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__50 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__50
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP235:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l112
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__51(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__51
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__52 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__52
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP238:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l116
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__53(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__53
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__54 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__54
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP241:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l120
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__55(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__55
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__56 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__56
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP244:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l124
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__57(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__57
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__58 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__58
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l129
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__59(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__59
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__60 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__60
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l134
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__61(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__61
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__62 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__62
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l139
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__63(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__63
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__64 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__64
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP247:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l144
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__65(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__65
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__66 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__66
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP250:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l149
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__67(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__67
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__68 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__68
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP253:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l154
+// CHECK-32-SAME: () #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__69(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__69
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__70 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32:       cond.true5:
+// CHECK-32-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32:       cond.false6:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END7]]
+// CHECK-32:       cond.end7:
+// CHECK-32-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__70
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP256:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l159
+// CHECK-32-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR8:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-NEXT:    [[TMP3:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK-32-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP3]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__71 to i8*), i8* null, i8** [[TMP4]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__71
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l162
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__72 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__72
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l165
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__73 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__73
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l168
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__74 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__74
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP259:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l171
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__75 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__75
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP262:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l174
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__76 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__76
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP265:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l177
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__77 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__77
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP268:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l180
+// CHECK-32-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-NEXT:    [[TMP3:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK-32-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP3]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__78 to i8*), i8* null, i8** [[TMP4]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__78
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP271:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-32-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l184
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__79 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__79
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP274:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+// CHECK-32-NEXT:    br i1 [[TMP10]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l188
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__80 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__80
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP277:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-32-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l192
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__81 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__81
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP280:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l196
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__82 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__82
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP283:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l200
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__83 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__83
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP286:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l204
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__84 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__84
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP289:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l208
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__85 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__85
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 65, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP292:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l213
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__86 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__86
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP295:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+// CHECK-32-NEXT:    br i1 [[TMP10]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l218
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__87 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__87
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP298:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-32-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l223
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__88 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__88
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP301:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l228
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__89 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__89
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP304:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l233
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__90 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__90
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP307:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
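+// NOTE: The schedule-type constant passed to __kmpc_dispatch_init_4 in
+// __omp_outlined__90 decodes as
+//   1073741859 = 0x40000023 = kmp_sch_dynamic_chunked (35)
+//                           | kmp_sch_modifier_nonmonotonic (1 << 30)
+// i.e. a nonmonotonic dynamic schedule driven by the __kmpc_dispatch_init_4 /
+// __kmpc_dispatch_next_4 pair, with the trailing __kmpc_barrier implementing
+// the implicit end-of-worksharing barrier.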
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l238
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__91 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__91
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP310:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
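+// NOTE: __omp_outlined__91 differs from __omp_outlined__90 only in the
+// dispatch constant:
+//   1073741860 = 0x40000024 = kmp_sch_guided_chunked (36)
+//                           | kmp_sch_modifier_nonmonotonic (1 << 30)
+// which corresponds to schedule(guided), e.g. (assumed sketch):
+//
+//   #pragma omp parallel for schedule(guided)
+//   for (int i = 0; i < 10; ++i)
+//     ;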
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l243
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__92 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__92
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
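+// NOTE: __omp_outlined__92 switches to __kmpc_for_static_init_4 with schedule
+// type 33 (kmp_sch_static_chunked) and chunk 1; the omp.dispatch.* outer loop
+// that advances .omp.lb/.omp.ub by .omp.stride strip-mines over the thread's
+// chunks and is closed by __kmpc_for_static_fini. Assumed sketch:
+//
+//   #pragma omp parallel for schedule(static, 1)
+//   for (int i = 0; i < 10; ++i)
+//     ;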
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l247
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__93 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__93
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32:       omp.loop.exit:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
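+// NOTE: Schedule type 34 (kmp_sch_static) in __omp_outlined__93 is the
+// unchunked static schedule: each thread gets one contiguous chunk, so the
+// upper bound is clamped to 9 once and there is no outer dispatch loop.
+// Assumed sketch:
+//
+//   #pragma omp parallel for schedule(static)
+//   for (int i = 0; i < 10; ++i)
+//     ;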
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l251
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__94 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__94
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l255
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__95 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__95
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP313:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
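+// NOTE: For __omp_outlined__95 the constant decodes as
+//   1073741862 = 0x40000026 = kmp_sch_auto (38)
+//                           | kmp_sch_modifier_nonmonotonic (1 << 30)
+// i.e. schedule(auto). The missing trailing __kmpc_barrier (present in
+// __omp_outlined__90/91 above) is consistent with a nowait clause on this
+// loop, though that is an assumption; the exact clause is in the test source.
+// Assumed sketch:
+//
+//   #pragma omp parallel for schedule(auto) nowait
+//   for (int i = 0; i < 10; ++i)
+//     ;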
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l259
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__96 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__96
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP316:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
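+// NOTE: __omp_outlined__96 uses
+//   1073741861 = 0x40000025 = kmp_sch_runtime (37)
+//                           | kmp_sch_modifier_nonmonotonic (1 << 30)
+// i.e. schedule(runtime), resolved against OMP_SCHEDULE at run time. Assumed
+// sketch:
+//
+//   #pragma omp parallel for schedule(runtime)
+//   for (int i = 0; i < 10; ++i)
+//     ;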
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l263
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__97 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__97
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP319:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l267
+// CHECK-32-SAME: () #[[ATTR8]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__98 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__98
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP322:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l14
+// CHECK-32-EX-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-EX-NEXT:    [[CONV1:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__CASTED]] to i8*
+// CHECK-32-EX-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
+// CHECK-32-EX-NEXT:    store i8 [[FROMBOOL]], i8* [[CONV1]], align 1
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR2:[0-9]+]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
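+// NOTE: From here the checks repeat under the CHECK-32-EX prefix, presumably a
+// RUN line that adds -fexceptions/-fcxx-exceptions (an assumption; the RUN
+// lines are at the top of the test). The l14 kernel forwards a captured bool
+// through an i32 (the trunc/zext dance on the .capture_expr. temporaries), and
+// the outlined function below initializes the distribute loop with schedule
+// type 91 (kmp_distribute_static_chunked) and a chunk equal to
+// __kmpc_get_hardware_num_threads_in_block(), the default dist_schedule
+// lowering for SPMD teams. Assumed source shape (sketch):
+//
+//   #pragma omp target teams distribute parallel for simd if(a)
+//   for (int i = 0; i < 10; ++i)
+//     ;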
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__CASTED16:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS19:%.*]] = alloca [3 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP5]] to i1
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-32-EX:       omp_if.then:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i8, i8* [[CONV]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TOBOOL2:%.*]] = trunc i8 [[TMP9]] to i1
+// CHECK-32-EX-NEXT:    [[CONV3:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__CASTED]] to i8*
+// CHECK-32-EX-NEXT:    [[FROMBOOL:%.*]] = zext i1 [[TOBOOL2]] to i8
+// CHECK-32-EX-NEXT:    store i8 [[FROMBOOL]], i8* [[CONV3]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP14]], i8** [[TMP13]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP16]], i8** [[TMP15]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i8, i8* [[CONV]], align 1, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TOBOOL4:%.*]] = trunc i8 [[TMP17]] to i1
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = zext i1 [[TOBOOL4]] to i32
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP18]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP19]], i32 3), !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD6]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[TMP26]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP7]], label [[COND_TRUE8:%.*]], label [[COND_FALSE9:%.*]]
+// CHECK-32-EX:       cond.true8:
+// CHECK-32-EX-NEXT:    br label [[COND_END10:%.*]]
+// CHECK-32-EX:       cond.false9:
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    br label [[COND_END10]]
+// CHECK-32-EX:       cond.end10:
+// CHECK-32-EX-NEXT:    [[COND11:%.*]] = phi i32 [ 9, [[COND_TRUE8]] ], [ [[TMP27]], [[COND_FALSE9]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND11]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP28]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP130]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP131:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-32-EX:       omp_if.else:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND12:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond12:
+// CHECK-32-EX-NEXT:    [[TMP29:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP13:%.*]] = icmp slt i32 [[TMP29]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY14:%.*]], label [[OMP_INNER_FOR_END30:%.*]]
+// CHECK-32-EX:       omp.inner.for.body14:
+// CHECK-32-EX-NEXT:    [[TMP30:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP32:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-EX-NEXT:    [[TOBOOL15:%.*]] = trunc i8 [[TMP32]] to i1
+// CHECK-32-EX-NEXT:    [[CONV17:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__CASTED16]] to i8*
+// CHECK-32-EX-NEXT:    [[FROMBOOL18:%.*]] = zext i1 [[TOBOOL15]] to i8
+// CHECK-32-EX-NEXT:    store i8 [[FROMBOOL18]], i8* [[CONV17]], align 1
+// CHECK-32-EX-NEXT:    [[TMP33:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED16]], align 4
+// CHECK-32-EX-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP35:%.*]] = inttoptr i32 [[TMP30]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP35]], i8** [[TMP34]], align 4
+// CHECK-32-EX-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP37:%.*]] = inttoptr i32 [[TMP31]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP37]], i8** [[TMP36]], align 4
+// CHECK-32-EX-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS19]], i32 0, i32 2
+// CHECK-32-EX-NEXT:    [[TMP39:%.*]] = inttoptr i32 [[TMP33]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP39]], i8** [[TMP38]], align 4
+// CHECK-32-EX-NEXT:    [[TMP40:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-EX-NEXT:    [[TOBOOL20:%.*]] = trunc i8 [[TMP40]] to i1
+// CHECK-32-EX-NEXT:    [[TMP41:%.*]] = zext i1 [[TOBOOL20]] to i32
+// CHECK-32-EX-NEXT:    [[TMP42:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS19]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP41]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP42]], i32 3)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC21:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc21:
+// CHECK-32-EX-NEXT:    [[TMP43:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP44:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD22:%.*]] = add nsw i32 [[TMP43]], [[TMP44]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD22]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP45:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP46:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD23:%.*]] = add nsw i32 [[TMP45]], [[TMP46]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD23]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP47:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP48:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD24:%.*]] = add nsw i32 [[TMP47]], [[TMP48]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD24]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP49:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[TMP49]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP25]], label [[COND_TRUE26:%.*]], label [[COND_FALSE27:%.*]]
+// CHECK-32-EX:       cond.true26:
+// CHECK-32-EX-NEXT:    br label [[COND_END28:%.*]]
+// CHECK-32-EX:       cond.false27:
+// CHECK-32-EX-NEXT:    [[TMP50:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END28]]
+// CHECK-32-EX:       cond.end28:
+// CHECK-32-EX-NEXT:    [[COND29:%.*]] = phi i32 [ 9, [[COND_TRUE26]] ], [ [[TMP50]], [[COND_FALSE27]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND29]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP51:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP51]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND12]], !llvm.loop [[LOOP134:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end30:
+// CHECK-32-EX-NEXT:    br label [[OMP_IF_END]]
+// CHECK-32-EX:       omp_if.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP53:%.*]] = icmp ne i32 [[TMP52]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP53]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-32-EX:       omp_if.then:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP136]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP137:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-32-EX:       omp_if.else:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND2:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond2:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CMP3:%.*]] = icmp ule i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY4:%.*]], label [[OMP_INNER_FOR_END10:%.*]]
+// CHECK-32-EX:       omp.inner.for.body4:
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD6]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE7:%.*]]
+// CHECK-32-EX:       omp.body.continue7:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC8:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc8:
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND2]], !llvm.loop [[LOOP139:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end10:
+// CHECK-32-EX-NEXT:    br label [[OMP_IF_END]]
+// CHECK-32-EX:       omp_if.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP20]])
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-32-EX:       omp_if.then:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP4]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP140]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP141:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_IF_END:%.*]]
+// CHECK-32-EX:       omp_if.else:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP12]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND2:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond2:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CMP3:%.*]] = icmp ule i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY4:%.*]], label [[OMP_INNER_FOR_END10:%.*]]
+// CHECK-32-EX:       omp.inner.for.body4:
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD6]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE7:%.*]]
+// CHECK-32-EX:       omp.body.continue7:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC8:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc8:
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND2]], !llvm.loop [[LOOP143:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end10:
+// CHECK-32-EX-NEXT:    br label [[OMP_IF_END]]
+// CHECK-32-EX:       omp_if.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP20]])
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l17
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP144]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP145:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__4
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP147]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP148:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP12]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l20
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__5(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__6 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP150]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP151:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__6
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP153]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP154:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l23
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__7(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__7
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__8 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP156]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP157:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__8
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP159]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP160:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__9
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP162]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP163:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__10
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP165]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP166:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l29
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__11
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP168]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP169:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__12
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP171]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP172:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l32
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__13(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__13
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__14 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP174]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP175:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__14
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP177]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP178:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l36
+// CHECK-32-EX-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP2]], i32* [[A_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A_CASTED]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__15(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__15
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[A1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
+// CHECK-32-EX-NEXT:    [[A_ON_STACK:%.*]] = bitcast i8* [[A1]] to i32*
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP8]], i32* [[A_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP13]], i8** [[TMP12]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = inttoptr i32 [[TMP9]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP15]], i8** [[TMP14]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32, i32)* @__omp_outlined__16 to i8*), i8* null, i8** [[TMP16]], i32 3)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP23]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK-32-EX:       cond.true6:
+// CHECK-32-EX-NEXT:    br label [[COND_END8:%.*]]
+// CHECK-32-EX:       cond.false7:
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END8]]
+// CHECK-32-EX:       cond.end8:
+// CHECK-32-EX-NEXT:    [[COND9:%.*]] = phi i32 [ 9, [[COND_TRUE6]] ], [ [[TMP24]], [[COND_FALSE7]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND9]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP25]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP27]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-32-EX:       .omp.lastprivate.then:
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = load i32, i32* [[A_ON_STACK]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP28]], i32* [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-32-EX:       .omp.lastprivate.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_free_shared(i8* [[A1]], i32 4)
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__16
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[A1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP8]], i32* [[A1]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP12]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-32-EX:       .omp.lastprivate.then:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP13]], i32* [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-32-EX:       .omp.lastprivate.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l39
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__17(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__17
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__18 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__18
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l42
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__19(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__19
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__20 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__20
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l45
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__21(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__21
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__22 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__22
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP180]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP181:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l48
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__23(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__23
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__24 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__24
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP183]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP184:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l51
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__25(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__25
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__26 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__26
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP186]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP187:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l54
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__27(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__27
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__28 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__28
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP189]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP190:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l57
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__29(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__29
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[B:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__30 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP192]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP193:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__30
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP195]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP196:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l65
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__31(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__31
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[B:%.*]] = alloca [3 x i32], align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = bitcast [3 x i32]* [[B]] to i8*
+// CHECK-32-EX-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP0]], i8* align 4 bitcast ([3 x i32]* @"__const.<captured>.b" to i8*), i32 12, i1 false)
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP5]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__32 to i8*), i8* null, i8** [[TMP13]], i32 2), !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP20]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP21]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP22]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP198]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP199:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__32
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP201]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP202:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP12]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l72
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__33(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__33
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[C:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
+// CHECK-32-EX-NEXT:    [[C_ON_STACK:%.*]] = bitcast i8* [[C]] to i32**
+// CHECK-32-EX-NEXT:    [[B:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
+// CHECK-32-EX-NEXT:    [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__34 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__34_wrapper to i8*), i8** [[TMP13]], i32 2), !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP204]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP205:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = icmp ne i32 [[TMP16]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP17]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    store i32* [[B_ON_STACK]], i32** [[C_ON_STACK]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_free_shared(i8* [[B]], i32 4)
+// CHECK-32-EX-NEXT:    call void @__kmpc_free_shared(i8* [[C]], i32 4)
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__34
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP207]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP208:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP11]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__34_wrapper
+// CHECK-32-EX-SAME: (i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
+// CHECK-32-EX-NEXT:    store i16 [[TMP0]], i16* [[DOTADDR]], align 2
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__34(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32 [[TMP5]], i32 [[TMP8]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l80
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__35(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__35
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__36 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP210]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP211:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__36
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP213]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP214:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l84
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__37(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__37
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__38 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP216]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP217:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__38
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP219]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP220:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l88
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__39(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__39
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__40 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP222]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP223:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__40
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP225]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP226:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l92
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__41(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__41
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__42 to i8*), i8* null, i8** [[TMP12]], i32 2), !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP228]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP229:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP23]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__42
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP231]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP232:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l96
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__43(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__43
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__44 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__44
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l100
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__45(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__45
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__46 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__46
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l104
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__47(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__47
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__48 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__48
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l108
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__49(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__49
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__50 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__50
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP234]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP235:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l112
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__51(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__51
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__52 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__52
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP237]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP238:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l116
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__53(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__53
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__54 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__54
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP240]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP241:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l120
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__55(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__55
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__56 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__56
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP243]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP244:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l124
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__57(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__57
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__58 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__58
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l129
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__59(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__59
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__60 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__60
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l134
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__61(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__61
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__62 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__62
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp ule i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l139
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__63(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__63
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__64 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__64
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741862, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP246]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP247:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l144
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__65(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__65
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__66 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__66
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741861, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP249]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP250:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l149
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__67(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__67
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__68 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__68
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741859, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP252]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP253:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l154
+// CHECK-32-EX-SAME: () #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__69(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__69
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 91, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP6]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP9]], i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = inttoptr i32 [[TMP7]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32, i32)* @__omp_outlined__70 to i8*), i8* null, i8** [[TMP12]], i32 2)
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP19]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK-32-EX:       cond.true5:
+// CHECK-32-EX-NEXT:    br label [[COND_END7:%.*]]
+// CHECK-32-EX:       cond.false6:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END7]]
+// CHECK-32-EX:       cond.end7:
+// CHECK-32-EX-NEXT:    [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP20]], [[COND_FALSE6]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND8]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP21]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__70
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1073741860, i32 [[TMP2]], i32 [[TMP3]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP6]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP11]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP255]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP256:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l159
+// CHECK-32-EX-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR8:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP3]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__71 to i8*), i8* null, i8** [[TMP4]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__71
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l162
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__72 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__72
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l165
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__73 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__73
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l168
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__74 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__74
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP258]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP259:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l171
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__75 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__75
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP261]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP262:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l174
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__76 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__76
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP264]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP265:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l177
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__77 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__77
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP267]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP268:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l180
+// CHECK-32-EX-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = bitcast i32* [[DOTCAPTURE_EXPR__ADDR]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i8, i8* [[CONV]], align 1
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = trunc i8 [[TMP2]] to i1
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = zext i1 [[TOBOOL]] to i32
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 [[TMP3]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__78 to i8*), i8* null, i8** [[TMP4]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__78
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP270]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP271:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l184
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__79 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__79
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP273]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP274:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP10]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l188
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__80 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__80
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP276]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP277:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l192
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__81 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__81
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP279]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP280:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l196
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__82 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__82
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP282]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP283:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l200
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__83 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__83
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP285]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP286:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l204
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__84 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__84
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP288]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP289:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l208
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__85 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__85
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 65, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_fini_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]), !llvm.access.group [[ACC_GRP291]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP292:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l213
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__86 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__86
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP294]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP295:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP10]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l218
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__87 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__87
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP297]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP298:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP16]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l223
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__88 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__88
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP300]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP301:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l228
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__89 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__89
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP303]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP304:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l233
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__90 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__90
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP306]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP307:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l238
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__91 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__91
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP309]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP310:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l243
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__92 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__92
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l247
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__93 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__93
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32-EX:       omp.loop.exit:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l251
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__94 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__94
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB3]], i32 [[TMP1]])
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l255
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__95 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__95
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741862, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP312]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP313:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l259
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__96 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__96
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741861, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP315]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP316:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l263
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__97 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__97
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741859, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP318]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP319:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l267
+// CHECK-32-EX-SAME: () #[[ATTR8]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__98 to i8*), i8* null, i8** [[TMP2]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__98
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_dispatch_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1073741860, i32 0, i32 9, i32 1, i32 1)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]])
+// CHECK-32-EX-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TOBOOL]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sle i32 [[TMP4]], [[TMP5]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], i32* [[I]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], i32* [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP321]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP322:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    ret void
+//

diff  --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
index 0f14d687bf79b..2d8805471c9fc 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
@@ -1,24 +1,22 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK45-64
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK45-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefixes=CHECK45-32,CHECK45-32-EX
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-64
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix=CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefixes=CHECK-32,CHECK-32-EX
 
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
 
 // Check that the execution mode of all 3 target regions on the gpu is set to SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak protected constant i8 2
 
 template<typename tx>
 tx ftemplate(int n) {
@@ -53,5 +51,955 @@ int bar(int n){
   return a;
 }
 
-// CHECK-NOT: call void @__kmpc_push_proc_bind
 #endif
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
+// CHECK45-64-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-64-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK45-64-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
+// CHECK45-64-SAME: (i64 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[AA_CASTED:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK45-64-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-64-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[AA_CASTED]], align 8
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr
+// CHECK45-64-NEXT:    store ptr [[TMP5]], ptr [[TMP4]], align 8
+// CHECK45-64-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK45-64-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[AA:%.*]]) #[[ATTR1]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-64-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK45-64-NEXT:    store i16 [[CONV1]], ptr [[AA_ADDR]], align 2
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35
+// CHECK45-64-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[AA_CASTED:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK45-64-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-64-NEXT:    store i16 [[TMP5]], ptr [[AA_CASTED]], align 2
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[AA_CASTED]], align 8
+// CHECK45-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK45-64-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK45-64-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 8
+// CHECK45-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK45-64-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP6]] to ptr
+// CHECK45-64-NEXT:    store ptr [[TMP10]], ptr [[TMP9]], align 8
+// CHECK45-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK45-64-NEXT:    store ptr [[TMP0]], ptr [[TMP11]], align 8
+// CHECK45-64-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK45-64-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+// CHECK45-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-64-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK45-64-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 2
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK45-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
+// CHECK45-32-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 0)
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK45-32-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
+// CHECK45-32-SAME: (i32 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK45-32-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-32-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr
+// CHECK45-32-NEXT:    store ptr [[TMP5]], ptr [[TMP4]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 1)
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK45-32-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR1]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-32-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK45-32-NEXT:    store i16 [[CONV1]], ptr [[AA_ADDR]], align 2
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35
+// CHECK45-32-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK45-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-32-NEXT:    store i16 [[TMP5]], ptr [[AA_CASTED]], align 2
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK45-32-NEXT:    [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr
+// CHECK45-32-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
+// CHECK45-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK45-32-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP6]] to ptr
+// CHECK45-32-NEXT:    store ptr [[TMP10]], ptr [[TMP9]], align 4
+// CHECK45-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK45-32-NEXT:    store ptr [[TMP0]], ptr [[TMP11]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 3)
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK45-32-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+// CHECK45-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-32-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK45-32-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 2
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK45-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
+// CHECK45-32-EX-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-EX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 0)
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK45-32-EX-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
+// CHECK45-32-EX-SAME: (i32 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-32-EX-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr
+// CHECK45-32-EX-NEXT:    store ptr [[TMP5]], ptr [[TMP4]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 1)
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK45-32-EX-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR1]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-32-EX-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK45-32-EX-NEXT:    store i16 [[CONV1]], ptr [[AA_ADDR]], align 2
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35
+// CHECK45-32-EX-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-32-EX-NEXT:    store i16 [[TMP5]], ptr [[AA_CASTED]], align 2
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK45-32-EX-NEXT:    [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr
+// CHECK45-32-EX-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK45-32-EX-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP6]] to ptr
+// CHECK45-32-EX-NEXT:    store ptr [[TMP10]], ptr [[TMP9]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK45-32-EX-NEXT:    store ptr [[TMP0]], ptr [[TMP11]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 3)
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK45-32-EX-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK45-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+// CHECK45-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-32-EX-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK45-32-EX-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 2
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
+// CHECK-64-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-64-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
+// CHECK-64-SAME: (i64 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[AA_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-64-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[AA_CASTED]], align 8
+// CHECK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr
+// CHECK-64-NEXT:    store ptr [[TMP5]], ptr [[TMP4]], align 8
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 1)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-64-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[AA:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-64-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK-64-NEXT:    store i16 [[CONV1]], ptr [[AA_ADDR]], align 2
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35
+// CHECK-64-SAME: (i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[AA_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK-64-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP5]], ptr [[AA_CASTED]], align 2
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[AA_CASTED]], align 8
+// CHECK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-64-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 8
+// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP6]] to ptr
+// CHECK-64-NEXT:    store ptr [[TMP10]], ptr [[TMP9]], align 8
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK-64-NEXT:    store ptr [[TMP0]], ptr [[TMP11]], align 8
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-64-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[AA_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
+// CHECK-64-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-64-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK-64-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 2
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 0)
+// CHECK-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
+// CHECK-SAME: (i32 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr
+// CHECK-NEXT:    store ptr [[TMP5]], ptr [[TMP4]], align 4
+// CHECK-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 1)
+// CHECK-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK-NEXT:    store i16 [[CONV1]], ptr [[AA_ADDR]], align 2
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35
+// CHECK-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK:       user_code.entry:
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-NEXT:    store i16 [[TMP5]], ptr [[AA_CASTED]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr
+// CHECK-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP6]] to ptr
+// CHECK-NEXT:    store ptr [[TMP10]], ptr [[TMP9]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[TMP11]], align 4
+// CHECK-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 3)
+// CHECK-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-NEXT:    ret void
+// CHECK:       worker.exit:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
+// CHECK-32-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 0)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-32-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
+// CHECK-32-SAME: (i32 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK-32-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr
+// CHECK-32-NEXT:    store ptr [[TMP5]], ptr [[TMP4]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 1)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-32-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-32-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK-32-NEXT:    store i16 [[CONV1]], ptr [[AA_ADDR]], align 2
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35
+// CHECK-32-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP5]], ptr [[AA_CASTED]], align 2
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr
+// CHECK-32-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP6]] to ptr
+// CHECK-32-NEXT:    store ptr [[TMP10]], ptr [[TMP9]], align 4
+// CHECK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK-32-NEXT:    store ptr [[TMP0]], ptr [[TMP11]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 3)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-32-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
+// CHECK-32-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-32-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK-32-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 2
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l26
+// CHECK-32-EX-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 0)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-32-EX-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l30
+// CHECK-32-EX-SAME: (i32 noundef [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK-32-EX-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP2]], ptr [[AA_CASTED]], align 2
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr
+// CHECK-32-EX-NEXT:    store ptr [[TMP5]], ptr [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 1)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-32-EX-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[AA:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-32-EX-NEXT:    [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK-32-EX-NEXT:    store i16 [[CONV1]], ptr [[AA_ADDR]], align 2
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l35
+// CHECK-32-EX-SAME: (i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[AA_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK-32-EX-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP5]], ptr [[AA_CASTED]], align 2
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[AA_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = inttoptr i32 [[TMP4]] to ptr
+// CHECK-32-EX-NEXT:    store ptr [[TMP8]], ptr [[TMP7]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = inttoptr i32 [[TMP6]] to ptr
+// CHECK-32-EX-NEXT:    store ptr [[TMP10]], ptr [[TMP9]], align 4
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK-32-EX-NEXT:    store ptr [[TMP0]], ptr [[TMP11]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 3)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-32-EX-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i16, ptr [[AA_ADDR]], align 2
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-32-EX-NEXT:    [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
+// CHECK-32-EX-NEXT:    store i16 [[CONV2]], ptr [[AA_ADDR]], align 2
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 2
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK-32-EX-NEXT:    ret void
+//

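Context for the next file: the hand-written CHECK lines removed below verified the device-side reduction scheme — per-warp shuffle-and-reduce, an inter-warp copy through a [32 x i32] shared-memory transfer medium guarded by barriers, and a final read into warp 0 — for three SPMD target regions (l27, l32, l38). The test bodies themselves are elided from the diff context; as orientation only, here is a minimal sketch of the kind of constructs those regions correspond to, inferred from the removed checks (the fadd-by-5 on a double, the i8 xor / float fmul pair, and the i32 or / i16 max pair), not the verbatim test source:

template <typename tx>
tx ftemplate(int n) {
  // Sketch inferred from the removed CHECK lines; names and initial values
  // are illustrative, not the test's actual body.
  int a = 0;     // reduced with |
  short b = 0;   // reduced with max
  tx c = 0;      // reduced with ^
  float d = 1;   // reduced with *
  double e = 0;  // reduced with +

  // l27: single double '+' reduction; removed checks match 'fadd double %ev, 5'.
#pragma omp target parallel reduction(+: e)
  e += 5;

  // l32: a char and a float reduced together; removed checks match an i8 xor
  // and a float fmul.
#pragma omp target parallel reduction(^: c) reduction(*: d)
  {
    c ^= 2;
    d *= 33;
  }

  // l38: an i32 'or' reduction and an i16 'max' reduction; removed checks
  // match 'or i32 ..., 1' and a signed compare against 99.
#pragma omp target parallel reduction(|: a) reduction(max: b)
  {
    a |= 1;
    b = 99 > b ? 99 : b;
  }

  return a + b + c + d + e;
}

Each such region lowers to an __omp_offloading_* kernel that calls __kmpc_nvptx_parallel_reduce_nowait_v2 with a per-construct shuffle-and-reduce function and an inter-warp copy function; the regenerated CHECK-64/CHECK-32/CHECK-32-EX lines below spell that lowering out in full instead of the pattern checks being removed.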
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index b9e2c48001d60..6e8fd25f313da 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -1,20 +1,17 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-64
 // RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK-32
+// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefixes=CHECK-32,CHECK-32-EX
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
 
 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
 
 // Check that the execution mode of all 3 target regions is set to Spmd Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak protected constant i8 2
 
 template<typename tx>
 tx ftemplate(int n) {
@@ -52,741 +49,2246 @@ int bar(int n){
   return a;
 }
 
-// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
-//
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-//
-//
 // define internal void [[PFN]](
-// CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
-// CHECK: [[EV:%.+]] = load double, double* [[E]], align
-// CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
-// CHECK: store double [[ADD]], double* [[E]], align
-// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
-// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
-// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
-// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
-// CHECK: br i1 [[CMP]], label
 
-// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
-// CHECK: [[EV:%.+]] = load double, double* [[E]], align
-// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
-// CHECK: store double [[ADD]], double* [[E_IN]], align
-// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-// CHECK: br label
-//
-// CHECK: ret
 
-//
 // Reduction function
-// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* noundef %0, i8* noundef %1)
-// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
-// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
-//
-// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
-// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
-//
-// CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
-// CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
-// CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
-// CHECK: store double [[RES]], double* [[VAR_LHS]],
-// CHECK: ret void
 
-//
 // Shuffle and reduce function
-// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* noundef %0, i16 noundef {{.*}}, i16 noundef {{.*}}, i16 noundef {{.*}})
-// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
-// CHECK: [[REMOTE_ELT:%.+]] = alloca double
-//
-// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
-// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
-// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
-//
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to double**
-// CHECK: [[ELT:%.+]] = load double*, double** [[ELT_REF_CAST]],
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-//
-// CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
-// CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
-// CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
-// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-// CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
-//
-// CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
-// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
-// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
-//
 // Condition to reduce
-// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
-//
-// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
-// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
-//
-// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
-// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
-// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
-// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
-// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
-// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
-//
-// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
-// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
-// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
-//
-// CHECK: [[DO_REDUCE]]
-// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
-// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
-// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
-// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
-//
-// CHECK: [[REDUCE_ELSE]]
-// CHECK: br label {{%?}}[[REDUCE_CONT]]
-//
-// CHECK: [[REDUCE_CONT]]
 // Now check if we should just copy over the remote reduction list
-// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
-// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
-// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-//
-// CHECK: [[DO_COPY]]
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[REMOTE_ELT_REF_CAST:%.+]] = bitcast i8** [[REMOTE_ELT_REF]] to double**
-// CHECK: [[REMOTE_ELT:%.+]] = load double*, double** [[REMOTE_ELT_REF_CAST]],
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to double**
-// CHECK: [[ELT:%.+]] = load double*, double** [[ELT_REF_CAST]],
-// CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
-// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
-// CHECK: br label {{%?}}[[COPY_CONT:.+]]
-//
-// CHECK: [[COPY_ELSE]]
-// CHECK: br label {{%?}}[[COPY_CONT]]
-//
-// CHECK: [[COPY_CONT]]
-// CHECK: void
 
-//
 // Inter warp copy function
-// CHECK: define internal void [[WARP_COPY_FN]](i8* noundef %0, i32 noundef %1)
-// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
-// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
-// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
-// CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
-// CHECK: br label
-// CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
-// CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
-// CHECK: br i1 [[DONE_COPY]], label
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-//
 // [[DO_COPY]]
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
-//
-// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
-// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
-// CHECK: br label {{%?}}[[COPY_CONT:.+]]
-//
-// CHECK: [[COPY_ELSE]]
-// CHECK: br label {{%?}}[[COPY_CONT]]
-//
 // Barrier after copy to shared memory storage medium.
-// CHECK: [[COPY_CONT]]
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-//
 // Read into warp 0.
-// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-//
-// CHECK: [[DO_READ]]
-// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
-// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
-// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
-// CHECK: br label {{%?}}[[READ_CONT:.+]]
-//
-// CHECK: [[READ_ELSE]]
-// CHECK: br label {{%?}}[[READ_CONT]]
-//
-// CHECK: [[READ_CONT]]
-// CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
-// CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
-// CHECK: br label
-// CHECK: ret
 
-// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
-//
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-//
-//
 // define internal void [[PFN1]](
-// CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
-// CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
-// CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
-// CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
-// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
-// CHECK: store i8 [[TRUNC]], i8* [[C]], align
-// CHECK: [[DV:%.+]] = load float, float* [[D]], align
-// CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
-// CHECK: store float [[MUL]], float* [[D]], align
-// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: store i8* [[C]], i8** [[PTR1]], align
-// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
-// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
-// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
-// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
-// CHECK: br i1 [[CMP]], label
-// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
-// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
-// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
-// CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
-// CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
-// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
-// CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
-// CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
-// CHECK: [[DV:%.+]] = load float, float* [[D]], align
-// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
-// CHECK: store float [[MUL]], float* [[D_IN]], align
-// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-// CHECK: br label
-//
-// CHECK: ret
 
-//
 // Reduction function
-// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* noundef %0, i8* noundef %1)
-// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
-//
-// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
-//
-// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
-// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
-//
-// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
-// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
-//
-// CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
-// CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
-// CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
-// CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
-// CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
-// CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
-// CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
-//
-// CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
-// CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
-// CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
-// CHECK: store float [[RES]], float* [[VAR2_LHS]],
-// CHECK: ret void
 
-//
 // Shuffle and reduce function
-// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* noundef %0, i16 noundef {{.*}}, i16 noundef {{.*}}, i16 noundef {{.*}})
-// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
-// CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
-// CHECK: [[REMOTE_ELT2:%.+]] = alloca float
-//
-// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
-// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
-// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
-//
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
-//
-// CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
-// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-// CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
-// CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
-//
-// CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
-// CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
-//
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to float**
-// CHECK: [[ELT:%.+]] = load float*, float** [[ELT_REF_CAST]],
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-//
-// CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
-// CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
-// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
-// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
-//
-// CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
-// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
-// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
-//
 // Condition to reduce
-// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
-//
-// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
-// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
-//
-// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
-// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
-// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
-// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
-// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
-// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
-//
-// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
-// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
-// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
-//
-// CHECK: [[DO_REDUCE]]
-// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
-// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
-// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
-// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
-//
-// CHECK: [[REDUCE_ELSE]]
-// CHECK: br label {{%?}}[[REDUCE_CONT]]
-//
-// CHECK: [[REDUCE_CONT]]
 // Now check if we should just copy over the remote reduction list
-// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
-// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
-// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-//
-// CHECK: [[DO_COPY]]
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
-// CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
-//
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[REMOTE_ELT_REF_CAST:%.+]] = bitcast i8** [[REMOTE_ELT_REF]] to float**
-// CHECK: [[REMOTE_ELT:%.+]] = load float*, float** [[REMOTE_ELT_REF_CAST]],
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to float**
-// CHECK: [[ELT:%.+]] = load float*, float** [[ELT_REF_CAST]],
-// CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
-// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
-// CHECK: br label {{%?}}[[COPY_CONT:.+]]
-//
-// CHECK: [[COPY_ELSE]]
-// CHECK: br label {{%?}}[[COPY_CONT]]
-//
-// CHECK: [[COPY_CONT]]
-// CHECK: void
 
-//
 // Inter warp copy function
-// CHECK: define internal void [[WARP_COPY_FN]](i8* noundef %0, i32 noundef %1)
-// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
-// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
-// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-//
 // [[DO_COPY]]
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-//
-// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
-// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
-// CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-// CHECK: br label {{%?}}[[COPY_CONT:.+]]
-//
-// CHECK: [[COPY_ELSE]]
-// CHECK: br label {{%?}}[[COPY_CONT]]
-//
 // Barrier after copy to shared memory storage medium.
-// CHECK: [[COPY_CONT]]
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-//
 // Read into warp 0.
-// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-//
-// CHECK: [[DO_READ]]
-// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
-// CHECK: br label {{%?}}[[READ_CONT:.+]]
-//
-// CHECK: [[READ_ELSE]]
-// CHECK: br label {{%?}}[[READ_CONT]]
-//
-// CHECK: [[READ_CONT]]
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-//
 // [[DO_COPY]]
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-//
-// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
-// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-// CHECK: br label {{%?}}[[COPY_CONT:.+]]
-//
-// CHECK: [[COPY_ELSE]]
-// CHECK: br label {{%?}}[[COPY_CONT]]
-//
 // Barrier after copy to shared memory storage medium.
-// CHECK: [[COPY_CONT]]
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-//
 // Read into warp 0.
-// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-//
-// CHECK: [[DO_READ]]
-// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
-// CHECK: br label {{%?}}[[READ_CONT:.+]]
-//
-// CHECK: [[READ_ELSE]]
-// CHECK: br label {{%?}}[[READ_CONT]]
-//
-// CHECK: [[READ_CONT]]
-// CHECK: ret
 
-// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
-//
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-//
-//
 // define internal void [[PFN2]](
-// CHECK: store i32 0, i32* [[A:%.+]], align
-// CHECK: store i16 -32768, i16* [[B:%.+]], align
-// CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
-// CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
-// CHECK: store i32 [[OR]], i32* [[A]], align
-// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
-// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
-// CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
-// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
-//
-// CHECK: [[DO_MAX]]
-// CHECK: br label {{%?}}[[MAX_CONT:.+]]
-//
-// CHECK: [[MAX_ELSE]]
-// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
-// CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
-// CHECK: br label {{%?}}[[MAX_CONT]]
-//
-// CHECK: [[MAX_CONT]]
-// CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
-// CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
-// CHECK: store i16 [[TRUNC]], i16* [[B]], align
-// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
-// CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
-// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
-// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
-// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
-// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
-// CHECK: br i1 [[CMP]], label
 
-// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
-// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
-// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
-// CHECK: store i32 [[OR]], i32* [[A_IN]], align
-// CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
-// CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
-// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
-// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
-// CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
-// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
-//
-// CHECK: [[DO_MAX]]
-// CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
-// CHECK: br label {{%?}}[[MAX_CONT:.+]]
-//
-// CHECK: [[MAX_ELSE]]
-// CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
-// CHECK: br label {{%?}}[[MAX_CONT]]
-//
-// CHECK: [[MAX_CONT]]
-// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
-// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
-// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-// CHECK: br label
-//
-// CHECK: ret
 
-//
 // Reduction function
-// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* noundef %0, i8* noundef %1)
-// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
-// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
-//
-// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
-// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
-//
-// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
-// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
-//
-// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
-// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
-//
-// CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
-// CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
-// CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
-// CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
-//
-// CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
-// CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
-// CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
-// CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
-//
-// CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
-// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
-//
-// CHECK: [[DO_MAX]]
-// CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
-// CHECK: br label {{%?}}[[MAX_CONT:.+]]
-//
-// CHECK: [[MAX_ELSE]]
-// CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
-// CHECK: br label {{%?}}[[MAX_CONT]]
-//
-// CHECK: [[MAX_CONT]]
-// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
-// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
-// CHECK: ret void
 
-//
 // Shuffle and reduce function
-// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* noundef %0, i16 noundef {{.*}}, i16 noundef {{.*}}, i16 noundef {{.*}})
-// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
-// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
-// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
-//
-// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
-// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
-// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
-//
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to i32**
-// CHECK: [[ELT:%.+]] = load i32*, i32** [[ELT_REF_CAST]],
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
-//
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
-// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
-//
-// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
-// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
-// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
-//
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to i16**
-// CHECK: [[ELT:%.+]] = load i16*, i16** [[ELT_REF_CAST]],
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
-//
-// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
-// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
-// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
-//
-// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
-// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
-// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
-//
 // Condition to reduce
-// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
-//
-// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
-// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
-//
-// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
-// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
-// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
-// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
-// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
-// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
-//
-// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
-// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
-// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
-//
-// CHECK: [[DO_REDUCE]]
-// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
-// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
-// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
-// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
-//
-// CHECK: [[REDUCE_ELSE]]
-// CHECK: br label {{%?}}[[REDUCE_CONT]]
-//
-// CHECK: [[REDUCE_CONT]]
 // Now check if we should just copy over the remote reduction list
-// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
-// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
-// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-//
-// CHECK: [[DO_COPY]]
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[REMOTE_ELT_REF_CAST:%.+]] = bitcast i8** [[REMOTE_ELT_REF]] to i32**
-// CHECK: [[REMOTE_ELT:%.+]] = load i32*, i32** [[REMOTE_ELT_REF_CAST]],
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to i32**
-// CHECK: [[ELT:%.+]] = load i32*, i32** [[ELT_REF_CAST]],
-// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
-// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
-//
-// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[REMOTE_ELT_REF_CAST:%.+]] = bitcast i8** [[REMOTE_ELT_REF]] to i16**
-// CHECK: [[REMOTE_ELT:%.+]] = load i16*, i16** [[REMOTE_ELT_REF_CAST]],
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_REF_CAST:%.+]] = bitcast i8** [[ELT_REF]] to i16**
-// CHECK: [[ELT:%.+]] = load i16*, i16** [[ELT_REF_CAST]],
-// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
-// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
-// CHECK: br label {{%?}}[[COPY_CONT:.+]]
-//
-// CHECK: [[COPY_ELSE]]
-// CHECK: br label {{%?}}[[COPY_CONT]]
-//
-// CHECK: [[COPY_CONT]]
-// CHECK: void
 
-//
 // Inter warp copy function
-// CHECK: define internal void [[WARP_COPY_FN]](i8* noundef %0, i32 noundef %1)
-// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
-// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
-// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-//
 // [[DO_COPY]]
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-//
-// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
-// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-// CHECK: br label {{%?}}[[COPY_CONT:.+]]
-//
-// CHECK: [[COPY_ELSE]]
-// CHECK: br label {{%?}}[[COPY_CONT]]
-//
 // Barrier after copy to shared memory storage medium.
-// CHECK: [[COPY_CONT]]
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-//
 // Read into warp 0.
-// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-//
-// CHECK: [[DO_READ]]
-// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
-// CHECK: br label {{%?}}[[READ_CONT:.+]]
-//
-// CHECK: [[READ_ELSE]]
-// CHECK: br label {{%?}}[[READ_CONT]]
-//
-// CHECK: [[READ_CONT]]
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-//
 // [[DO_COPY]]
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
-//
-// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
-// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
-// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-// CHECK: br label {{%?}}[[COPY_CONT:.+]]
-//
-// CHECK: [[COPY_ELSE]]
-// CHECK: br label {{%?}}[[COPY_CONT]]
-//
 // Barrier after copy to shared memory storage medium.
-// CHECK: [[COPY_CONT]]
-// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-//
 // Read into warp 0.
-// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-//
-// CHECK: [[DO_READ]]
-// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
-// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
-// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
-// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
-// CHECK: br label {{%?}}[[READ_CONT:.+]]
-//
-// CHECK: [[READ_ELSE]]
-// CHECK: br label {{%?}}[[READ_CONT]]
-//
-// CHECK: [[READ_CONT]]
-// CHECK: ret
 
 #endif
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l24
+// CHECK-64-SAME: (double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
+// CHECK-64-NEXT:    store double* [[E]], double** [[E_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP0]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP4]], i8** [[TMP3]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, double*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
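The two check blocks above show the generic-mode kernel entry: __kmpc_target_init returns -1 only on the thread that should execute user code (every other thread leaves through worker.exit), the captured double* is packed into a [1 x i8*] argument array, and __kmpc_parallel_51 launches the outlined region before __kmpc_target_deinit tears it down. A minimal source shape that yields this IR, reconstructed from the checks (the actual ftemplate body in the test may differ in detail):

  template <typename tx>
  tx ftemplate(int n) {
    double e = 0;
  #pragma omp target parallel reduction(+ : e)
    {
      // Matches the 'fadd double %tmp, 5.000000e+00' seen in @__omp_outlined__.
      e += 5;
    }
    return (tx)e;
  }
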
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 8
+// CHECK-64-NEXT:    [[E1:%.*]] = alloca double, align 8
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store double* [[E]], double** [[E_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8
+// CHECK-64-NEXT:    store double 0.000000e+00, double* [[E1]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load double, double* [[E1]], align 8
+// CHECK-64-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
+// CHECK-64-NEXT:    store double [[ADD]], double* [[E1]], align 8
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP5:%.*]] = bitcast double* [[E1]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 8
+// CHECK-64-NEXT:    [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-64-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i64 8, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func)
+// CHECK-64-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1
+// CHECK-64-NEXT:    br i1 [[TMP8]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-64:       .omp.reduction.then:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load double, double* [[E1]], align 8
+// CHECK-64-NEXT:    [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP10]]
+// CHECK-64-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
+// CHECK-64-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
+// CHECK-64-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-64:       .omp.reduction.done:
+// CHECK-64-NEXT:    ret void
+//
+//
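In @__omp_outlined__ the reduction follows the NVPTX runtime contract: the private copy is registered in a reduce list (one i8* slot per reduction variable), and __kmpc_nvptx_parallel_reduce_nowait_v2 receives the variable count, the byte size of that list (i64 8 here, i64 16 for the two-variable kernels below), and two callbacks. The call returns 1 on exactly one thread, which merges its private value into the original and finishes with __kmpc_nvptx_end_reduce_nowait. The callback shapes, read off the function types in the call above (declarations are a sketch, assuming <cstdint> integer names):

  #include <cstdint>

  // Intra-warp step: shuffles a remote reduce list into scratch storage and
  // conditionally combines it (see @_omp_reduction_shuffle_and_reduce_func).
  typedef void (*ShuffleReduceFn)(void *reduce_list, int16_t lane_id,
                                  int16_t lane_offset, int16_t algo_version);

  // Cross-warp step: moves per-warp partial results through shared memory
  // (see @_omp_reduction_inter_warp_copy_func).
  typedef void (*InterWarpCopyFn)(void *reduce_list, int32_t num_active_warps);
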
+// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK-64-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
+// CHECK-64-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to double**
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load double*, double** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP11]], i64 1
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
+// CHECK-64-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP11]] to i64*
+// CHECK-64-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
+// CHECK-64-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-64-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
+// CHECK-64-NEXT:    [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
+// CHECK-64-NEXT:    store i64 [[TMP20]], i64* [[TMP16]], align 8
+// CHECK-64-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
+// CHECK-64-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
+// CHECK-64-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-64-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+// CHECK-64-NEXT:    [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-64-NEXT:    [[TMP29:%.*]] = and i16 [[TMP6]], 1
+// CHECK-64-NEXT:    [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
+// CHECK-64-NEXT:    [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
+// CHECK-64-NEXT:    [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-64-NEXT:    [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
+// CHECK-64-NEXT:    [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
+// CHECK-64-NEXT:    [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
+// CHECK-64-NEXT:    br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-64:       then:
+// CHECK-64-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK-64-NEXT:    [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-64-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3:[0-9]+]]
+// CHECK-64-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-64:       else:
+// CHECK-64-NEXT:    br label [[IFCONT]]
+// CHECK-64:       ifcont:
+// CHECK-64-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-64-NEXT:    [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
+// CHECK-64-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK-64:       then4:
+// CHECK-64-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to double**
+// CHECK-64-NEXT:    [[TMP43:%.*]] = load double*, double** [[TMP42]], align 8
+// CHECK-64-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to double**
+// CHECK-64-NEXT:    [[TMP46:%.*]] = load double*, double** [[TMP45]], align 8
+// CHECK-64-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP43]], align 8
+// CHECK-64-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
+// CHECK-64-NEXT:    br label [[IFCONT6:%.*]]
+// CHECK-64:       else5:
+// CHECK-64-NEXT:    br label [[IFCONT6]]
+// CHECK-64:       ifcont6:
+// CHECK-64-NEXT:    ret void
+//
+//
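The shuffle-and-reduce function above moves the remote lane's double as a single i64 through __kmpc_shuffle_int64 and then branches on the fourth argument: version 0 reduces unconditionally, version 1 reduces only on lanes below the offset (lanes at or above it instead copy the remote element back, the 'then4' block), and version 2 reduces on even-numbered lanes while the offset is positive, giving the tree schedule. On NVPTX the 64-bit shuffle is conceptually a pair of 32-bit warp shuffles; a device-side sketch of that idea (an illustration, not the runtime's actual implementation):

  #include <cstdint>

  // Move a 64-bit payload 'delta' lanes down the warp by splitting it into
  // two 32-bit halves, shuffling each half, and reassembling the result.
  __device__ int64_t shuffle_int64_down(int64_t val, unsigned delta) {
    int32_t lo = (int32_t)(val & 0xffffffff);
    int32_t hi = (int32_t)((uint64_t)val >> 32);
    lo = __shfl_down_sync(0xffffffffu, lo, delta);
    hi = __shfl_down_sync(0xffffffffu, hi, delta);
    return ((int64_t)hi << 32) | (uint32_t)lo;
  }
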
+// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK-64-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-64-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
+// CHECK-64-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
+// CHECK-64-NEXT:    br label [[PRECOND:%.*]]
+// CHECK-64:       precond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
+// CHECK-64-NEXT:    br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK-64:       body:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-64-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-64:       then:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
+// CHECK-64-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK-64-NEXT:    store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
+// CHECK-64-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-64:       else:
+// CHECK-64-NEXT:    br label [[IFCONT]]
+// CHECK-64:       ifcont:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
+// CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-64:       then2:
+// CHECK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8
+// CHECK-64-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
+// CHECK-64-NEXT:    [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP22]], i32* [[TMP21]], align 4
+// CHECK-64-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-64:       else3:
+// CHECK-64-NEXT:    br label [[IFCONT4]]
+// CHECK-64:       ifcont4:
+// CHECK-64-NEXT:    [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-64-NEXT:    store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
+// CHECK-64-NEXT:    br label [[PRECOND]]
+// CHECK-64:       exit:
+// CHECK-64-NEXT:    ret void
+//
+//
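The inter-warp copy function streams the 8-byte double through the 32-slot i32 transfer array in two rounds (the precond/body loop bounded by 'icmp ult i32 %8, 2'): in each round, lane 0 of every warp publishes one 32-bit chunk to slot [warp_id], a barrier separates producers from consumers, and the first num-active-warps threads (all in warp 0) read slot [tid] back into their reduce list. In the multi-variable kernels below the same pattern is simply unrolled, one barrier-delimited round per element, with sub-word elements written through a narrowed view of an i32 slot. A sketch of one round under those assumptions (names are illustrative; the runtime synchronizes with __kmpc_barrier rather than __syncthreads):

  #include <cstdint>

  // One transfer slot per possible warp in the team (32 x 32-bit).
  __shared__ volatile int32_t transfer_medium[32];

  __device__ void inter_warp_copy_round(int32_t *chunk, int num_active_warps) {
    int tid  = threadIdx.x;
    int lane = tid & 31;   // matches 'and i32 %tid, 31'
    int warp = tid >> 5;   // matches 'ashr i32 %tid, 5'
    __syncthreads();
    if (lane == 0)                  // each warp master publishes its chunk
      transfer_medium[warp] = *chunk;
    __syncthreads();
    if (tid < num_active_warps)     // warp 0 gathers one chunk per warp
      *chunk = transfer_medium[tid];
  }
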
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
+// CHECK-64-SAME: (i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 8
+// CHECK-64-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    store i8* [[TMP0]], i8** [[TMP4]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP1]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP6]], i8** [[TMP5]], align 8
+// CHECK-64-NEXT:    [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i8*, float*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP7]], i64 2)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
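This second kernel (line 29 of the test source) captures two variables; its entry block packs both pointers into a [2 x i8*] array before the __kmpc_parallel_51 launch. From the xor-by-2 on the char and the fmul-by-33.0 on the float in @__omp_outlined__1 below, the source shape is approximately (reconstructed; details may differ):

  char c = 0;   // identity for '^'
  float d = 1;  // identity for '*'
  #pragma omp target parallel reduction(^ : c) reduction(* : d)
  {
    c ^= 2;   // 'xor i32 %conv, 2' after the sign-extension
    d *= 33;  // 'fmul float %tmp, 3.300000e+01'
  }
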
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
+// CHECK-64-NEXT:    [[C1:%.*]] = alloca i8, align 1
+// CHECK-64-NEXT:    [[D2:%.*]] = alloca float, align 4
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 8
+// CHECK-64-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8
+// CHECK-64-NEXT:    store i8 0, i8* [[C1]], align 1
+// CHECK-64-NEXT:    store float 1.000000e+00, float* [[D2]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
+// CHECK-64-NEXT:    [[CONV:%.*]] = sext i8 [[TMP2]] to i32
+// CHECK-64-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
+// CHECK-64-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
+// CHECK-64-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 1
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load float, float* [[D2]], align 4
+// CHECK-64-NEXT:    [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
+// CHECK-64-NEXT:    store float [[MUL]], float* [[D2]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    store i8* [[C1]], i8** [[TMP6]], align 8
+// CHECK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP8:%.*]] = bitcast float* [[D2]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 8
+// CHECK-64-NEXT:    [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-64-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 2, i64 16, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4)
+// CHECK-64-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
+// CHECK-64-NEXT:    br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-64:       .omp.reduction.then:
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP0]], align 1
+// CHECK-64-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i8, i8* [[C1]], align 1
+// CHECK-64-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK-64-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
+// CHECK-64-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
+// CHECK-64-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load float, float* [[D2]], align 4
+// CHECK-64-NEXT:    [[MUL8:%.*]] = fmul float [[TMP14]], [[TMP15]]
+// CHECK-64-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
+// CHECK-64-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-64:       .omp.reduction.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
+// CHECK-64-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
+// CHECK-64-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
+// CHECK-64-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK-64-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-64-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
+// CHECK-64-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK-64-NEXT:    [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
+// CHECK-64-NEXT:    store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
+// CHECK-64-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
+// CHECK-64-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
+// CHECK-64-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8
+// CHECK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP22:%.*]] = bitcast i8** [[TMP21]] to float**
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load float*, float** [[TMP22]], align 8
+// CHECK-64-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP23]], i64 1
+// CHECK-64-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
+// CHECK-64-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP23]] to i32*
+// CHECK-64-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
+// CHECK-64-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK-64-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-64-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK-64-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
+// CHECK-64-NEXT:    store i32 [[TMP32]], i32* [[TMP28]], align 4
+// CHECK-64-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1
+// CHECK-64-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i64 1
+// CHECK-64-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP35]], i8** [[TMP24]], align 8
+// CHECK-64-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-64-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-64-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK-64-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-64-NEXT:    [[TMP41:%.*]] = and i16 [[TMP6]], 1
+// CHECK-64-NEXT:    [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
+// CHECK-64-NEXT:    [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
+// CHECK-64-NEXT:    [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-64-NEXT:    [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
+// CHECK-64-NEXT:    [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
+// CHECK-64-NEXT:    [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
+// CHECK-64-NEXT:    br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-64:       then:
+// CHECK-64-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK-64-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-64-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
+// CHECK-64-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-64:       else:
+// CHECK-64-NEXT:    br label [[IFCONT]]
+// CHECK-64:       ifcont:
+// CHECK-64-NEXT:    [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-64-NEXT:    [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT:    [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
+// CHECK-64-NEXT:    br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
+// CHECK-64:       then5:
+// CHECK-64-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8
+// CHECK-64-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 8
+// CHECK-64-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
+// CHECK-64-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
+// CHECK-64-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP59:%.*]] = bitcast i8** [[TMP58]] to float**
+// CHECK-64-NEXT:    [[TMP60:%.*]] = load float*, float** [[TMP59]], align 8
+// CHECK-64-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to float**
+// CHECK-64-NEXT:    [[TMP63:%.*]] = load float*, float** [[TMP62]], align 8
+// CHECK-64-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP60]], align 4
+// CHECK-64-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
+// CHECK-64-NEXT:    br label [[IFCONT7:%.*]]
+// CHECK-64:       else6:
+// CHECK-64-NEXT:    br label [[IFCONT7]]
+// CHECK-64:       ifcont7:
+// CHECK-64-NEXT:    ret void
+//
+//
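For the char element the shuffle path widens before moving data: the i8 is sign-extended to i32, passed through __kmpc_shuffle_int32, and truncated back, while the float travels bit-for-bit through an i32 view (the bitcast pairs around the shuffle call). The same widening applies to the short element in the 'max' kernel further down. The float half of that pattern, sketched with CUDA's bit-reinterpretation intrinsics (an illustration of the bitcasts, not the runtime code):

  // Floats ride the 32-bit integer shuffle via bit-level reinterpretation.
  __device__ float shuffle_float_down(float v, unsigned delta) {
    int bits = __float_as_int(v);                       // bitcast float -> i32
    bits = __shfl_down_sync(0xffffffffu, bits, delta);  // move lanes
    return __int_as_float(bits);                        // bitcast i32 -> float
  }
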
+// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
+// CHECK-64-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-64-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-64-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-64:       then:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK-64-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
+// CHECK-64-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
+// CHECK-64-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-64:       else:
+// CHECK-64-NEXT:    br label [[IFCONT]]
+// CHECK-64:       ifcont:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
+// CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-64:       then2:
+// CHECK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
+// CHECK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
+// CHECK-64-NEXT:    store i8 [[TMP18]], i8* [[TMP17]], align 1
+// CHECK-64-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-64:       else3:
+// CHECK-64-NEXT:    br label [[IFCONT4]]
+// CHECK-64:       ifcont4:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-64-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK-64:       then6:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
+// CHECK-64-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
+// CHECK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
+// CHECK-64-NEXT:    store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
+// CHECK-64-NEXT:    br label [[IFCONT8:%.*]]
+// CHECK-64:       else7:
+// CHECK-64-NEXT:    br label [[IFCONT8]]
+// CHECK-64:       ifcont8:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
+// CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
+// CHECK-64:       then10:
+// CHECK-64-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 8
+// CHECK-64-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
+// CHECK-64-NEXT:    [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP29]], i32* [[TMP28]], align 4
+// CHECK-64-NEXT:    br label [[IFCONT12:%.*]]
+// CHECK-64:       else11:
+// CHECK-64-NEXT:    br label [[IFCONT12]]
+// CHECK-64:       ifcont12:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l35
+// CHECK-64-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 8
+// CHECK-64-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-64-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 8
+// CHECK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK-64-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-64-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP8]], i64 2)
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 8
+// CHECK-64-NEXT:    [[A1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[B2:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-64-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, i32* [[A1]], align 4
+// CHECK-64-NEXT:    store i16 -32768, i16* [[B2]], align 2
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK-64-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
+// CHECK-64-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-64-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
+// CHECK-64-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 8
+// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 8
+// CHECK-64-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-64-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8)
+// CHECK-64-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+// CHECK-64-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-64:       .omp.reduction.then:
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK-64-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
+// CHECK-64-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-64-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-64-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
+// CHECK-64-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
+// CHECK-64-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK-64:       cond.true9:
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-64-NEXT:    br label [[COND_END11:%.*]]
+// CHECK-64:       cond.false10:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-64-NEXT:    br label [[COND_END11]]
+// CHECK-64:       cond.end11:
+// CHECK-64-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
+// CHECK-64-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
+// CHECK-64-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
+// CHECK-64-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-64:       .omp.reduction.done:
+// CHECK-64-NEXT:    ret void
+//
+//
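The third kernel pairs an 'or' reduction on an int with a 'max' reduction on a short; the max materializes as the sext / icmp sgt / phi sequence in both the local update and the final combine of @__omp_outlined__5. Reconstructed source shape (approximate):

  int a = 0;          // identity for '|'
  short b = -32768;   // identity for a signed short 'max'
  #pragma omp target parallel reduction(| : a) reduction(max : b)
  {
    a |= 1;               // 'or i32 %tmp, 1'
    b = b < 99 ? 99 : b;  // the 'icmp sgt i32 99, %conv' / phi pair
  }
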
+// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7
+// CHECK-64-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 8
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i64 1
+// CHECK-64-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-64-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
+// CHECK-64-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
+// CHECK-64-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i64 1
+// CHECK-64-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
+// CHECK-64-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 8
+// CHECK-64-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK-64-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
+// CHECK-64-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK-64-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
+// CHECK-64-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-64-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK-64-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
+// CHECK-64-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
+// CHECK-64-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK-64-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1
+// CHECK-64-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
+// CHECK-64-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK-64-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 8
+// CHECK-64-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-64-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-64-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
+// CHECK-64-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-64-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
+// CHECK-64-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
+// CHECK-64-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
+// CHECK-64-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-64-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
+// CHECK-64-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
+// CHECK-64-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
+// CHECK-64-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-64:       then:
+// CHECK-64-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK-64-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-64-NEXT:    call void @"_omp$reduction$reduction_func6"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
+// CHECK-64-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-64:       else:
+// CHECK-64-NEXT:    br label [[IFCONT]]
+// CHECK-64:       ifcont:
+// CHECK-64-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-64-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
+// CHECK-64-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
+// CHECK-64:       then5:
+// CHECK-64-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK-64-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 8
+// CHECK-64-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK-64-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 8
+// CHECK-64-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
+// CHECK-64-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK-64-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 8
+// CHECK-64-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK-64-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 8
+// CHECK-64-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
+// CHECK-64-NEXT:    br label [[IFCONT7:%.*]]
+// CHECK-64:       else6:
+// CHECK-64-NEXT:    br label [[IFCONT7]]
+// CHECK-64:       ifcont7:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8
+// CHECK-64-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-64-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-64-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
+// CHECK-64-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-64-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-64:       then:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
+// CHECK-64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK-64-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK-64-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-64:       else:
+// CHECK-64-NEXT:    br label [[IFCONT]]
+// CHECK-64:       ifcont:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
+// CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-64:       then2:
+// CHECK-64-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8
+// CHECK-64-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
+// CHECK-64-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-64:       else3:
+// CHECK-64-NEXT:    br label [[IFCONT4]]
+// CHECK-64:       ifcont4:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-64-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK-64:       then6:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
+// CHECK-64-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
+// CHECK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
+// CHECK-64-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
+// CHECK-64-NEXT:    br label [[IFCONT8:%.*]]
+// CHECK-64:       else7:
+// CHECK-64-NEXT:    br label [[IFCONT8]]
+// CHECK-64:       ifcont8:
+// CHECK-64-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
+// CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
+// CHECK-64:       then10:
+// CHECK-64-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-64-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
+// CHECK-64-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 8
+// CHECK-64-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
+// CHECK-64-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
+// CHECK-64-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
+// CHECK-64-NEXT:    br label [[IFCONT12:%.*]]
+// CHECK-64:       else11:
+// CHECK-64-NEXT:    br label [[IFCONT12]]
+// CHECK-64:       ifcont12:
+// CHECK-64-NEXT:    ret void
+//
+//
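The CHECK-32 run that follows exercises the same set of functions for a 32-bit target; it differs only in pointer width (align 4 and i32 GEP indices throughout) and in the reduce-list size argument, which shrinks to one 4-byte pointer per variable (the i32 4 in the reduce_nowait_v2 call, versus i64 8 above).
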
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l24
+// CHECK-32-SAME: (double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK-32-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP0]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP4]], i8** [[TMP3]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, double*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK-32-NEXT:    [[E1:%.*]] = alloca double, align 8
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK-32-NEXT:    store double 0.000000e+00, double* [[E1]], align 8
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load double, double* [[E1]], align 8
+// CHECK-32-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
+// CHECK-32-NEXT:    store double [[ADD]], double* [[E1]], align 8
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP5:%.*]] = bitcast double* [[E1]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-32-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 4, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func)
+// CHECK-32-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1
+// CHECK-32-NEXT:    br i1 [[TMP8]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-32:       .omp.reduction.then:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load double, double* [[E1]], align 8
+// CHECK-32-NEXT:    [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP10]]
+// CHECK-32-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
+// CHECK-32-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
+// CHECK-32-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-32:       .omp.reduction.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK-32-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
+// CHECK-32-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to double**
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load double*, double** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP11]], i32 1
+// CHECK-32-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
+// CHECK-32-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP11]] to i64*
+// CHECK-32-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
+// CHECK-32-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
+// CHECK-32-NEXT:    [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
+// CHECK-32-NEXT:    store i64 [[TMP20]], i64* [[TMP16]], align 8
+// CHECK-32-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
+// CHECK-32-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
+// CHECK-32-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 4
+// CHECK-32-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-32-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+// CHECK-32-NEXT:    [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-32-NEXT:    [[TMP29:%.*]] = and i16 [[TMP6]], 1
+// CHECK-32-NEXT:    [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
+// CHECK-32-NEXT:    [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
+// CHECK-32-NEXT:    [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-32-NEXT:    [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
+// CHECK-32-NEXT:    [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
+// CHECK-32-NEXT:    [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
+// CHECK-32-NEXT:    br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32:       then:
+// CHECK-32-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK-32-NEXT:    [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-32-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3:[0-9]+]]
+// CHECK-32-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32:       else:
+// CHECK-32-NEXT:    br label [[IFCONT]]
+// CHECK-32:       ifcont:
+// CHECK-32-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-NEXT:    [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
+// CHECK-32-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK-32:       then4:
+// CHECK-32-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to double**
+// CHECK-32-NEXT:    [[TMP43:%.*]] = load double*, double** [[TMP42]], align 4
+// CHECK-32-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to double**
+// CHECK-32-NEXT:    [[TMP46:%.*]] = load double*, double** [[TMP45]], align 4
+// CHECK-32-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP43]], align 8
+// CHECK-32-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
+// CHECK-32-NEXT:    br label [[IFCONT6:%.*]]
+// CHECK-32:       else5:
+// CHECK-32-NEXT:    br label [[IFCONT6]]
+// CHECK-32:       ifcont6:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK-32-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-32-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
+// CHECK-32-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
+// CHECK-32-NEXT:    br label [[PRECOND:%.*]]
+// CHECK-32:       precond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
+// CHECK-32-NEXT:    br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK-32:       body:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32:       then:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
+// CHECK-32-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK-32-NEXT:    store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
+// CHECK-32-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32:       else:
+// CHECK-32-NEXT:    br label [[IFCONT]]
+// CHECK-32:       ifcont:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
+// CHECK-32-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-32:       then2:
+// CHECK-32-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 4
+// CHECK-32-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
+// CHECK-32-NEXT:    [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP22]], i32* [[TMP21]], align 4
+// CHECK-32-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-32:       else3:
+// CHECK-32-NEXT:    br label [[IFCONT4]]
+// CHECK-32:       ifcont4:
+// CHECK-32-NEXT:    [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-NEXT:    store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
+// CHECK-32-NEXT:    br label [[PRECOND]]
+// CHECK-32:       exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
+// CHECK-32-SAME: (i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK-32-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    store i8* [[TMP0]], i8** [[TMP4]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP1]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP6]], i8** [[TMP5]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i8*, float*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP7]], i32 2)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK-32-NEXT:    [[C1:%.*]] = alloca i8, align 1
+// CHECK-32-NEXT:    [[D2:%.*]] = alloca float, align 4
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK-32-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK-32-NEXT:    store i8 0, i8* [[C1]], align 1
+// CHECK-32-NEXT:    store float 1.000000e+00, float* [[D2]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
+// CHECK-32-NEXT:    [[CONV:%.*]] = sext i8 [[TMP2]] to i32
+// CHECK-32-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
+// CHECK-32-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
+// CHECK-32-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 1
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load float, float* [[D2]], align 4
+// CHECK-32-NEXT:    [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
+// CHECK-32-NEXT:    store float [[MUL]], float* [[D2]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    store i8* [[C1]], i8** [[TMP6]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP8:%.*]] = bitcast float* [[D2]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-32-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 2, i32 8, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4)
+// CHECK-32-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
+// CHECK-32-NEXT:    br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-32:       .omp.reduction.then:
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP0]], align 1
+// CHECK-32-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i8, i8* [[C1]], align 1
+// CHECK-32-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK-32-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
+// CHECK-32-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
+// CHECK-32-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load float, float* [[D2]], align 4
+// CHECK-32-NEXT:    [[MUL8:%.*]] = fmul float [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
+// CHECK-32-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-32:       .omp.reduction.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
+// CHECK-32-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
+// CHECK-32-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
+// CHECK-32-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK-32-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
+// CHECK-32-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK-32-NEXT:    [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
+// CHECK-32-NEXT:    store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
+// CHECK-32-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK-32-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK-32-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP22:%.*]] = bitcast i8** [[TMP21]] to float**
+// CHECK-32-NEXT:    [[TMP23:%.*]] = load float*, float** [[TMP22]], align 4
+// CHECK-32-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP23]], i32 1
+// CHECK-32-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
+// CHECK-32-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP23]] to i32*
+// CHECK-32-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
+// CHECK-32-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK-32-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK-32-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
+// CHECK-32-NEXT:    store i32 [[TMP32]], i32* [[TMP28]], align 4
+// CHECK-32-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
+// CHECK-32-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
+// CHECK-32-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP35]], i8** [[TMP24]], align 4
+// CHECK-32-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-32-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK-32-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-32-NEXT:    [[TMP41:%.*]] = and i16 [[TMP6]], 1
+// CHECK-32-NEXT:    [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
+// CHECK-32-NEXT:    [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
+// CHECK-32-NEXT:    [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-32-NEXT:    [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
+// CHECK-32-NEXT:    [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
+// CHECK-32-NEXT:    [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
+// CHECK-32-NEXT:    br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32:       then:
+// CHECK-32-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK-32-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-32-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
+// CHECK-32-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32:       else:
+// CHECK-32-NEXT:    br label [[IFCONT]]
+// CHECK-32:       ifcont:
+// CHECK-32-NEXT:    [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-NEXT:    [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
+// CHECK-32-NEXT:    br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
+// CHECK-32:       then5:
+// CHECK-32-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
+// CHECK-32-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 4
+// CHECK-32-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
+// CHECK-32-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
+// CHECK-32-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP59:%.*]] = bitcast i8** [[TMP58]] to float**
+// CHECK-32-NEXT:    [[TMP60:%.*]] = load float*, float** [[TMP59]], align 4
+// CHECK-32-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to float**
+// CHECK-32-NEXT:    [[TMP63:%.*]] = load float*, float** [[TMP62]], align 4
+// CHECK-32-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP60]], align 4
+// CHECK-32-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
+// CHECK-32-NEXT:    br label [[IFCONT7:%.*]]
+// CHECK-32:       else6:
+// CHECK-32-NEXT:    br label [[IFCONT7]]
+// CHECK-32:       ifcont7:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
+// CHECK-32-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-32-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32:       then:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
+// CHECK-32-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
+// CHECK-32-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32:       else:
+// CHECK-32-NEXT:    br label [[IFCONT]]
+// CHECK-32:       ifcont:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
+// CHECK-32-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-32:       then2:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
+// CHECK-32-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
+// CHECK-32-NEXT:    store i8 [[TMP18]], i8* [[TMP17]], align 1
+// CHECK-32-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-32:       else3:
+// CHECK-32-NEXT:    br label [[IFCONT4]]
+// CHECK-32:       ifcont4:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK-32:       then6:
+// CHECK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
+// CHECK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
+// CHECK-32-NEXT:    store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
+// CHECK-32-NEXT:    br label [[IFCONT8:%.*]]
+// CHECK-32:       else7:
+// CHECK-32-NEXT:    br label [[IFCONT8]]
+// CHECK-32:       ifcont8:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
+// CHECK-32-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
+// CHECK-32:       then10:
+// CHECK-32-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 4
+// CHECK-32-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
+// CHECK-32-NEXT:    [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP29]], i32* [[TMP28]], align 4
+// CHECK-32-NEXT:    br label [[IFCONT12:%.*]]
+// CHECK-32:       else11:
+// CHECK-32-NEXT:    br label [[IFCONT12]]
+// CHECK-32:       ifcont12:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l35
+// CHECK-32-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK-32-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP8]], i32 2)
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK-32-NEXT:    [[A1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[B2:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, i32* [[A1]], align 4
+// CHECK-32-NEXT:    store i16 -32768, i16* [[B2]], align 2
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK-32-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
+// CHECK-32-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-32-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
+// CHECK-32-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK-32-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-32-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8)
+// CHECK-32-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+// CHECK-32-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-32:       .omp.reduction.then:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK-32-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-32-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-32-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
+// CHECK-32-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
+// CHECK-32-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK-32:       cond.true9:
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-32-NEXT:    br label [[COND_END11:%.*]]
+// CHECK-32:       cond.false10:
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-32-NEXT:    br label [[COND_END11]]
+// CHECK-32:       cond.end11:
+// CHECK-32-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
+// CHECK-32-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
+// CHECK-32-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
+// CHECK-32-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-32:       .omp.reduction.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7
+// CHECK-32-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-32-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
+// CHECK-32-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
+// CHECK-32-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
+// CHECK-32-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
+// CHECK-32-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK-32-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 4
+// CHECK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK-32-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 4
+// CHECK-32-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK-32-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
+// CHECK-32-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK-32-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
+// CHECK-32-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK-32-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
+// CHECK-32-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
+// CHECK-32-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK-32-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK-32-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
+// CHECK-32-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK-32-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 4
+// CHECK-32-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-32-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
+// CHECK-32-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-32-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
+// CHECK-32-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
+// CHECK-32-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
+// CHECK-32-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-32-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
+// CHECK-32-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
+// CHECK-32-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
+// CHECK-32-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32:       then:
+// CHECK-32-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK-32-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-32-NEXT:    call void @"_omp$reduction$reduction_func6"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
+// CHECK-32-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32:       else:
+// CHECK-32-NEXT:    br label [[IFCONT]]
+// CHECK-32:       ifcont:
+// CHECK-32-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
+// CHECK-32-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
+// CHECK-32:       then5:
+// CHECK-32-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK-32-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 4
+// CHECK-32-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK-32-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 4
+// CHECK-32-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
+// CHECK-32-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK-32-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 4
+// CHECK-32-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK-32-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 4
+// CHECK-32-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
+// CHECK-32-NEXT:    br label [[IFCONT7:%.*]]
+// CHECK-32:       else6:
+// CHECK-32-NEXT:    br label [[IFCONT7]]
+// CHECK-32:       ifcont7:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8
+// CHECK-32-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-32-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32:       then:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK-32-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK-32-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK-32-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK-32-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32:       else:
+// CHECK-32-NEXT:    br label [[IFCONT]]
+// CHECK-32:       ifcont:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
+// CHECK-32-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-32:       then2:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
+// CHECK-32-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-32:       else3:
+// CHECK-32-NEXT:    br label [[IFCONT4]]
+// CHECK-32:       ifcont4:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK-32:       then6:
+// CHECK-32-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
+// CHECK-32-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
+// CHECK-32-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
+// CHECK-32-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
+// CHECK-32-NEXT:    br label [[IFCONT8:%.*]]
+// CHECK-32:       else7:
+// CHECK-32-NEXT:    br label [[IFCONT8]]
+// CHECK-32:       ifcont8:
+// CHECK-32-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
+// CHECK-32-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
+// CHECK-32:       then10:
+// CHECK-32-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
+// CHECK-32-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
+// CHECK-32-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
+// CHECK-32-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
+// CHECK-32-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
+// CHECK-32-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
+// CHECK-32-NEXT:    br label [[IFCONT12:%.*]]
+// CHECK-32:       else11:
+// CHECK-32-NEXT:    br label [[IFCONT12]]
+// CHECK-32:       ifcont12:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l24
+// CHECK-32-EX-SAME: (double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
+// CHECK-32-EX-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP0]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP4]], i8** [[TMP3]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, double*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
+// CHECK-32-EX-NEXT:    [[E1:%.*]] = alloca double, align 8
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store double 0.000000e+00, double* [[E1]], align 8
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load double, double* [[E1]], align 8
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
+// CHECK-32-EX-NEXT:    store double [[ADD]], double* [[E1]], align 8
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = bitcast double* [[E1]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 4, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func)
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    br i1 [[TMP8]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-32-EX:       .omp.reduction.then:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load double, double* [[TMP0]], align 8
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load double, double* [[E1]], align 8
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP10]]
+// CHECK-32-EX-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
+// CHECK-32-EX-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-32-EX:       .omp.reduction.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK-32-EX-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-EX-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
+// CHECK-32-EX-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to double**
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load double*, double** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP11]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP11]] to i64*
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
+// CHECK-32-EX-NEXT:    store i64 [[TMP20]], i64* [[TMP16]], align 8
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP23]], i8** [[TMP12]], align 4
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-32-EX-NEXT:    [[TMP29:%.*]] = and i16 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
+// CHECK-32-EX-NEXT:    [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
+// CHECK-32-EX-NEXT:    [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-32-EX-NEXT:    [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
+// CHECK-32-EX-NEXT:    [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
+// CHECK-32-EX-NEXT:    [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
+// CHECK-32-EX-NEXT:    br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32-EX:       then:
+// CHECK-32-EX-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-32-EX-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3:[0-9]+]]
+// CHECK-32-EX-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32-EX:       else:
+// CHECK-32-EX-NEXT:    br label [[IFCONT]]
+// CHECK-32-EX:       ifcont:
+// CHECK-32-EX-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
+// CHECK-32-EX-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK-32-EX:       then4:
+// CHECK-32-EX-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP42:%.*]] = bitcast i8** [[TMP41]] to double**
+// CHECK-32-EX-NEXT:    [[TMP43:%.*]] = load double*, double** [[TMP42]], align 4
+// CHECK-32-EX-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP45:%.*]] = bitcast i8** [[TMP44]] to double**
+// CHECK-32-EX-NEXT:    [[TMP46:%.*]] = load double*, double** [[TMP45]], align 4
+// CHECK-32-EX-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP43]], align 8
+// CHECK-32-EX-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
+// CHECK-32-EX-NEXT:    br label [[IFCONT6:%.*]]
+// CHECK-32-EX:       else5:
+// CHECK-32-EX-NEXT:    br label [[IFCONT6]]
+// CHECK-32-EX:       ifcont6:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK-32-EX-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-EX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
+// CHECK-32-EX-NEXT:    br label [[PRECOND:%.*]]
+// CHECK-32-EX:       precond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
+// CHECK-32-EX-NEXT:    br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK-32-EX:       body:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-EX-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32-EX:       then:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
+// CHECK-32-EX-NEXT:    store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
+// CHECK-32-EX-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32-EX:       else:
+// CHECK-32-EX-NEXT:    br label [[IFCONT]]
+// CHECK-32-EX:       ifcont:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
+// CHECK-32-EX-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-32-EX:       then2:
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 4
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP22]], i32* [[TMP21]], align 4
+// CHECK-32-EX-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-32-EX:       else3:
+// CHECK-32-EX-NEXT:    br label [[IFCONT4]]
+// CHECK-32-EX:       ifcont4:
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
+// CHECK-32-EX-NEXT:    br label [[PRECOND]]
+// CHECK-32-EX:       exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
+// CHECK-32-EX-SAME: (i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-EX-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    store i8* [[TMP0]], i8** [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP1]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP6]], i8** [[TMP5]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i8*, float*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP7]], i32 2)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-EX-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
+// CHECK-32-EX-NEXT:    [[C1:%.*]] = alloca i8, align 1
+// CHECK-32-EX-NEXT:    [[D2:%.*]] = alloca float, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i8 0, i8* [[C1]], align 1
+// CHECK-32-EX-NEXT:    store float 1.000000e+00, float* [[D2]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = sext i8 [[TMP2]] to i32
+// CHECK-32-EX-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
+// CHECK-32-EX-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
+// CHECK-32-EX-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 1
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load float, float* [[D2]], align 4
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
+// CHECK-32-EX-NEXT:    store float [[MUL]], float* [[D2]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    store i8* [[C1]], i8** [[TMP6]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = bitcast float* [[D2]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 2, i32 8, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4)
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-32-EX:       .omp.reduction.then:
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP0]], align 1
+// CHECK-32-EX-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP12]] to i32
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i8, i8* [[C1]], align 1
+// CHECK-32-EX-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK-32-EX-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
+// CHECK-32-EX-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
+// CHECK-32-EX-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load float, float* [[D2]], align 4
+// CHECK-32-EX-NEXT:    [[MUL8:%.*]] = fmul float [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-32-EX:       .omp.reduction.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
+// CHECK-32-EX-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-EX-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
+// CHECK-32-EX-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
+// CHECK-32-EX-NEXT:    store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK-32-EX-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = bitcast i8** [[TMP21]] to float**
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = load float*, float** [[TMP22]], align 4
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP23]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP23]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK-32-EX-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-EX-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK-32-EX-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
+// CHECK-32-EX-NEXT:    store i32 [[TMP32]], i32* [[TMP28]], align 4
+// CHECK-32-EX-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP35]], i8** [[TMP24]], align 4
+// CHECK-32-EX-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK-32-EX-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-32-EX-NEXT:    [[TMP41:%.*]] = and i16 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
+// CHECK-32-EX-NEXT:    [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
+// CHECK-32-EX-NEXT:    [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-32-EX-NEXT:    [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
+// CHECK-32-EX-NEXT:    [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
+// CHECK-32-EX-NEXT:    [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
+// CHECK-32-EX-NEXT:    br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32-EX:       then:
+// CHECK-32-EX-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-32-EX-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
+// CHECK-32-EX-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32-EX:       else:
+// CHECK-32-EX-NEXT:    br label [[IFCONT]]
+// CHECK-32-EX:       ifcont:
+// CHECK-32-EX-NEXT:    [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
+// CHECK-32-EX-NEXT:    br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
+// CHECK-32-EX:       then5:
+// CHECK-32-EX-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
+// CHECK-32-EX-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 4
+// CHECK-32-EX-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
+// CHECK-32-EX-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
+// CHECK-32-EX-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP59:%.*]] = bitcast i8** [[TMP58]] to float**
+// CHECK-32-EX-NEXT:    [[TMP60:%.*]] = load float*, float** [[TMP59]], align 4
+// CHECK-32-EX-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to float**
+// CHECK-32-EX-NEXT:    [[TMP63:%.*]] = load float*, float** [[TMP62]], align 4
+// CHECK-32-EX-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP60]], align 4
+// CHECK-32-EX-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
+// CHECK-32-EX-NEXT:    br label [[IFCONT7:%.*]]
+// CHECK-32-EX:       else6:
+// CHECK-32-EX-NEXT:    br label [[IFCONT7]]
+// CHECK-32-EX:       ifcont7:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
+// CHECK-32-EX-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-EX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-EX-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32-EX:       then:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
+// CHECK-32-EX-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
+// CHECK-32-EX-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32-EX:       else:
+// CHECK-32-EX-NEXT:    br label [[IFCONT]]
+// CHECK-32-EX:       ifcont:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
+// CHECK-32-EX-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-32-EX:       then2:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
+// CHECK-32-EX-NEXT:    store i8 [[TMP18]], i8* [[TMP17]], align 1
+// CHECK-32-EX-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-32-EX:       else3:
+// CHECK-32-EX-NEXT:    br label [[IFCONT4]]
+// CHECK-32-EX:       ifcont4:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-EX-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK-32-EX:       then6:
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
+// CHECK-32-EX-NEXT:    store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
+// CHECK-32-EX-NEXT:    br label [[IFCONT8:%.*]]
+// CHECK-32-EX:       else7:
+// CHECK-32-EX-NEXT:    br label [[IFCONT8]]
+// CHECK-32-EX:       ifcont8:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
+// CHECK-32-EX-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
+// CHECK-32-EX:       then10:
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 4
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP29]], i32* [[TMP28]], align 4
+// CHECK-32-EX-NEXT:    br label [[IFCONT12:%.*]]
+// CHECK-32-EX:       else11:
+// CHECK-32-EX-NEXT:    br label [[IFCONT12]]
+// CHECK-32-EX:       ifcont12:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l35
+// CHECK-32-EX-SAME: (i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK-32-EX-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+// CHECK-32-EX-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP8]], i32 2)
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__5
+// CHECK-32-EX-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
+// CHECK-32-EX-NEXT:    [[A1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[B2:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, i32* [[A1]], align 4
+// CHECK-32-EX-NEXT:    store i16 -32768, i16* [[B2]], align 2
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK-32-EX-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
+// CHECK-32-EX-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-32-EX-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
+// CHECK-32-EX-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8)
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
+// CHECK-32-EX-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK-32-EX:       .omp.reduction.then:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
+// CHECK-32-EX-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-32-EX-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-32-EX-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
+// CHECK-32-EX-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK-32-EX:       cond.true9:
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-32-EX-NEXT:    br label [[COND_END11:%.*]]
+// CHECK-32-EX:       cond.false10:
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
+// CHECK-32-EX-NEXT:    br label [[COND_END11]]
+// CHECK-32-EX:       cond.end11:
+// CHECK-32-EX-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
+// CHECK-32-EX-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
+// CHECK-32-EX-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK-32-EX:       .omp.reduction.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7
+// CHECK-32-EX-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-EX-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
+// CHECK-32-EX-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = bitcast i8** [[TMP9]] to i32**
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32*, i32** [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
+// CHECK-32-EX-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP21]], i8** [[TMP12]], align 4
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = bitcast i8** [[TMP22]] to i16**
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = load i16*, i16** [[TMP23]], align 4
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP24]], align 2
+// CHECK-32-EX-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
+// CHECK-32-EX-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-32-EX-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
+// CHECK-32-EX-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
+// CHECK-32-EX-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
+// CHECK-32-EX-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
+// CHECK-32-EX-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
+// CHECK-32-EX-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
+// CHECK-32-EX-NEXT:    store i8* [[TMP36]], i8** [[TMP25]], align 4
+// CHECK-32-EX-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
+// CHECK-32-EX-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
+// CHECK-32-EX-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
+// CHECK-32-EX-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
+// CHECK-32-EX-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
+// CHECK-32-EX-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
+// CHECK-32-EX-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
+// CHECK-32-EX-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
+// CHECK-32-EX-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
+// CHECK-32-EX-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32-EX:       then:
+// CHECK-32-EX-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
+// CHECK-32-EX-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
+// CHECK-32-EX-NEXT:    call void @"_omp$reduction$reduction_func6"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
+// CHECK-32-EX-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32-EX:       else:
+// CHECK-32-EX-NEXT:    br label [[IFCONT]]
+// CHECK-32-EX:       ifcont:
+// CHECK-32-EX-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
+// CHECK-32-EX-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
+// CHECK-32-EX-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
+// CHECK-32-EX:       then5:
+// CHECK-32-EX-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i32**
+// CHECK-32-EX-NEXT:    [[TMP56:%.*]] = load i32*, i32** [[TMP55]], align 4
+// CHECK-32-EX-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP58:%.*]] = bitcast i8** [[TMP57]] to i32**
+// CHECK-32-EX-NEXT:    [[TMP59:%.*]] = load i32*, i32** [[TMP58]], align 4
+// CHECK-32-EX-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP56]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
+// CHECK-32-EX-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP62:%.*]] = bitcast i8** [[TMP61]] to i16**
+// CHECK-32-EX-NEXT:    [[TMP63:%.*]] = load i16*, i16** [[TMP62]], align 4
+// CHECK-32-EX-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP65:%.*]] = bitcast i8** [[TMP64]] to i16**
+// CHECK-32-EX-NEXT:    [[TMP66:%.*]] = load i16*, i16** [[TMP65]], align 4
+// CHECK-32-EX-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP63]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
+// CHECK-32-EX-NEXT:    br label [[IFCONT7:%.*]]
+// CHECK-32-EX:       else6:
+// CHECK-32-EX-NEXT:    br label [[IFCONT7]]
+// CHECK-32-EX:       ifcont7:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8
+// CHECK-32-EX-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
+// CHECK-32-EX-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-32-EX-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-EX-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK-32-EX:       then:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
+// CHECK-32-EX-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
+// CHECK-32-EX-NEXT:    br label [[IFCONT:%.*]]
+// CHECK-32-EX:       else:
+// CHECK-32-EX-NEXT:    br label [[IFCONT]]
+// CHECK-32-EX:       ifcont:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
+// CHECK-32-EX-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK-32-EX:       then2:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
+// CHECK-32-EX-NEXT:    br label [[IFCONT4:%.*]]
+// CHECK-32-EX:       else3:
+// CHECK-32-EX-NEXT:    br label [[IFCONT4]]
+// CHECK-32-EX:       ifcont4:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-32-EX-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
+// CHECK-32-EX:       then6:
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
+// CHECK-32-EX-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
+// CHECK-32-EX-NEXT:    br label [[IFCONT8:%.*]]
+// CHECK-32-EX:       else7:
+// CHECK-32-EX-NEXT:    br label [[IFCONT8]]
+// CHECK-32-EX:       ifcont8:
+// CHECK-32-EX-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
+// CHECK-32-EX-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
+// CHECK-32-EX-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
+// CHECK-32-EX:       then10:
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
+// CHECK-32-EX-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
+// CHECK-32-EX-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
+// CHECK-32-EX-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
+// CHECK-32-EX-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
+// CHECK-32-EX-NEXT:    br label [[IFCONT12:%.*]]
+// CHECK-32-EX:       else11:
+// CHECK-32-EX-NEXT:    br label [[IFCONT12]]
+// CHECK-32-EX:       ifcont12:
+// CHECK-32-EX-NEXT:    ret void
+//

diff  --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
index 9e2b35a3b8b0b..09a01c364bc5c 100644
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -1,25 +1,22 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK45-64
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK45-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefixes=CHECK45-32,CHECK45-32-EX
 
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-64
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefixes=CHECK-32,CHECK-32-EX
 
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
 
 // Check that the execution mode of all 4 target regions on the gpu is set to NonSPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak protected constant i8 2
 
 #define N 1000
 
@@ -60,33 +57,1457 @@ int bar(int n){
   return a;
 }
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l32}}(
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-NOT: call void @__kmpc_for_static_init
-// CHECK-NOT: call void @__kmpc_for_static_fini
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-// CHECK: ret void
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l37}}(
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-NOT: call void @__kmpc_for_static_init
-// CHECK-NOT: call void @__kmpc_for_static_fini
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-// CHECK: ret void
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l42}}(
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-NOT: call void @__kmpc_for_static_init
-// CHECK-NOT: call void @__kmpc_for_static_fini
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-// CHECK: ret void
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l47}}(
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK-NOT: call void @__kmpc_for_static_init
-// CHECK-NOT: call void @__kmpc_for_static_fini
-// CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-// CHECK: ret void
-
 #endif
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK45-64-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK45-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-64-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-64-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK45-64-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK45-64:       simd.if.then:
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-64:       omp.inner.for.cond:
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-64-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK45-64-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-64:       omp.inner.for.body:
+// CHECK45-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK45-64-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-64-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK45-64-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-64:       omp.body.continue:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-64:       omp.inner.for.inc:
+// CHECK45-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       omp.inner.for.end:
+// CHECK45-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK45-64-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK45-64-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK45-64-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK45-64-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK45-64-NEXT:    br label [[SIMD_IF_END]]
+// CHECK45-64:       simd.if.end:
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK45-64-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK45-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-64-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-64-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK45-64-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK45-64:       simd.if.then:
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-64:       omp.inner.for.cond:
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-64-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK45-64-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-64:       omp.inner.for.body:
+// CHECK45-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK45-64-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-64-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK45-64-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
+// CHECK45-64-NEXT:    [[ADD6:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-64-NEXT:    [[CONV7:%.*]] = trunc i32 [[ADD6]] to i16
+// CHECK45-64-NEXT:    store i16 [[CONV7]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-64:       omp.body.continue:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-64:       omp.inner.for.inc:
+// CHECK45-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       omp.inner.for.end:
+// CHECK45-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[SUB9:%.*]] = sub nsw i32 [[TMP11]], 0
+// CHECK45-64-NEXT:    [[DIV10:%.*]] = sdiv i32 [[SUB9]], 1
+// CHECK45-64-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[DIV10]], 1
+// CHECK45-64-NEXT:    [[ADD12:%.*]] = add nsw i32 0, [[MUL11]]
+// CHECK45-64-NEXT:    store i32 [[ADD12]], ptr [[I3]], align 4
+// CHECK45-64-NEXT:    br label [[SIMD_IF_END]]
+// CHECK45-64:       simd.if.end:
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK45-64-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-64:       omp.inner.for.cond:
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK45-64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// CHECK45-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-64:       omp.inner.for.body:
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD1]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-64:       omp.body.continue:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-64:       omp.inner.for.inc:
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       omp.inner.for.end:
+// CHECK45-64-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK45-64-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]]) #[[ATTR0]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[N1:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[N1]], align 4
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-64:       omp.inner.for.cond:
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK45-64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP3]], 10
+// CHECK45-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-64:       omp.inner.for.body:
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-64:       omp.body.continue:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-64:       omp.inner.for.inc:
+// CHECK45-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       omp.inner.for.end:
+// CHECK45-64-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[N1]], align 4
+// CHECK45-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK45-64-NEXT:    store i32 [[ADD4]], ptr [[TMP1]], align 4
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK45-32-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK45-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-32-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-32-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK45-32-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK45-32:       simd.if.then:
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32:       omp.inner.for.cond:
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-32-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK45-32-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32:       omp.inner.for.body:
+// CHECK45-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK45-32-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP8]]
+// CHECK45-32-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32:       omp.body.continue:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32:       omp.inner.for.inc:
+// CHECK45-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       omp.inner.for.end:
+// CHECK45-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK45-32-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK45-32-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK45-32-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK45-32-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK45-32-NEXT:    br label [[SIMD_IF_END]]
+// CHECK45-32:       simd.if.end:
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK45-32-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK45-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-32-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-32-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK45-32-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK45-32:       simd.if.then:
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32:       omp.inner.for.cond:
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-32-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK45-32-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32:       omp.inner.for.body:
+// CHECK45-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK45-32-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP8]]
+// CHECK45-32-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
+// CHECK45-32-NEXT:    [[ADD6:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-32-NEXT:    [[CONV7:%.*]] = trunc i32 [[ADD6]] to i16
+// CHECK45-32-NEXT:    store i16 [[CONV7]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32:       omp.body.continue:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32:       omp.inner.for.inc:
+// CHECK45-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       omp.inner.for.end:
+// CHECK45-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[SUB9:%.*]] = sub nsw i32 [[TMP11]], 0
+// CHECK45-32-NEXT:    [[DIV10:%.*]] = sdiv i32 [[SUB9]], 1
+// CHECK45-32-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[DIV10]], 1
+// CHECK45-32-NEXT:    [[ADD12:%.*]] = add nsw i32 0, [[MUL11]]
+// CHECK45-32-NEXT:    store i32 [[ADD12]], ptr [[I3]], align 4
+// CHECK45-32-NEXT:    br label [[SIMD_IF_END]]
+// CHECK45-32:       simd.if.end:
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK45-32-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32:       omp.inner.for.cond:
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK45-32-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// CHECK45-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32:       omp.inner.for.body:
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP4]]
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD1]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32:       omp.body.continue:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32:       omp.inner.for.inc:
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       omp.inner.for.end:
+// CHECK45-32-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK45-32-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]]) #[[ATTR0]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[N1:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[N1]], align 4
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32:       omp.inner.for.cond:
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK45-32-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP3]], 10
+// CHECK45-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32:       omp.inner.for.body:
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP5]]
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32:       omp.body.continue:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32:       omp.inner.for.inc:
+// CHECK45-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       omp.inner.for.end:
+// CHECK45-32-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[N1]], align 4
+// CHECK45-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK45-32-NEXT:    store i32 [[ADD4]], ptr [[TMP1]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK45-32-EX-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK45-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-32-EX-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK45-32-EX:       simd.if.then:
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32-EX:       omp.inner.for.cond:
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-32-EX-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32-EX:       omp.inner.for.body:
+// CHECK45-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK45-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP8]]
+// CHECK45-32-EX-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32-EX:       omp.body.continue:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32-EX:       omp.inner.for.inc:
+// CHECK45-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       omp.inner.for.end:
+// CHECK45-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK45-32-EX-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK45-32-EX-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK45-32-EX-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK45-32-EX-NEXT:    br label [[SIMD_IF_END]]
+// CHECK45-32-EX:       simd.if.end:
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK45-32-EX-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK45-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-32-EX-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK45-32-EX:       simd.if.then:
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32-EX:       omp.inner.for.cond:
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-32-EX-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32-EX:       omp.inner.for.body:
+// CHECK45-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK45-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP8]]
+// CHECK45-32-EX-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
+// CHECK45-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-32-EX-NEXT:    [[CONV7:%.*]] = trunc i32 [[ADD6]] to i16
+// CHECK45-32-EX-NEXT:    store i16 [[CONV7]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32-EX:       omp.body.continue:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32-EX:       omp.inner.for.inc:
+// CHECK45-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       omp.inner.for.end:
+// CHECK45-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[SUB9:%.*]] = sub nsw i32 [[TMP11]], 0
+// CHECK45-32-EX-NEXT:    [[DIV10:%.*]] = sdiv i32 [[SUB9]], 1
+// CHECK45-32-EX-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[DIV10]], 1
+// CHECK45-32-EX-NEXT:    [[ADD12:%.*]] = add nsw i32 0, [[MUL11]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD12]], ptr [[I3]], align 4
+// CHECK45-32-EX-NEXT:    br label [[SIMD_IF_END]]
+// CHECK45-32-EX:       simd.if.end:
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK45-32-EX-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32-EX:       omp.inner.for.cond:
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK45-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// CHECK45-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32-EX:       omp.inner.for.body:
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP4]]
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD1]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32-EX:       omp.body.continue:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32-EX:       omp.inner.for.inc:
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       omp.inner.for.end:
+// CHECK45-32-EX-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK45-32-EX-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]]) #[[ATTR0]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[N1:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[N1]], align 4
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32-EX:       omp.inner.for.cond:
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK45-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP3]], 10
+// CHECK45-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32-EX:       omp.inner.for.body:
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP5]]
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32-EX:       omp.body.continue:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32-EX:       omp.inner.for.inc:
+// CHECK45-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       omp.inner.for.end:
+// CHECK45-32-EX-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[N1]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD4]], ptr [[TMP1]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK-64-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-64-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-64-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK-64:       simd.if.then:
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-64-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK-64-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK-64-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK-64-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK-64-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK-64-NEXT:    br label [[SIMD_IF_END]]
+// CHECK-64:       simd.if.end:
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK-64-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-64-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-64-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK-64:       simd.if.then:
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP8]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
+// CHECK-64-NEXT:    [[ADD6:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-64-NEXT:    [[CONV7:%.*]] = trunc i32 [[ADD6]] to i16
+// CHECK-64-NEXT:    store i16 [[CONV7]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[SUB9:%.*]] = sub nsw i32 [[TMP11]], 0
+// CHECK-64-NEXT:    [[DIV10:%.*]] = sdiv i32 [[SUB9]], 1
+// CHECK-64-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[DIV10]], 1
+// CHECK-64-NEXT:    [[ADD12:%.*]] = add nsw i32 0, [[MUL11]]
+// CHECK-64-NEXT:    store i32 [[ADD12]], ptr [[I3]], align 4
+// CHECK-64-NEXT:    br label [[SIMD_IF_END]]
+// CHECK-64:       simd.if.end:
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK-64-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK-64-NEXT:    store i32 [[ADD1]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK-64-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[N1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[N1]], align 4
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP3]], 10
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-64-NEXT:    store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[N1]], align 4
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    store i32 [[ADD4]], ptr [[TMP1]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK-32-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK-32:       simd.if.then:
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP8]]
+// CHECK-32-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK-32-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK-32-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK-32-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK-32-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK-32-NEXT:    br label [[SIMD_IF_END]]
+// CHECK-32:       simd.if.end:
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK-32-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK-32:       simd.if.then:
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP8]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
+// CHECK-32-NEXT:    [[ADD6:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-32-NEXT:    [[CONV7:%.*]] = trunc i32 [[ADD6]] to i16
+// CHECK-32-NEXT:    store i16 [[CONV7]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[SUB9:%.*]] = sub nsw i32 [[TMP11]], 0
+// CHECK-32-NEXT:    [[DIV10:%.*]] = sdiv i32 [[SUB9]], 1
+// CHECK-32-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[DIV10]], 1
+// CHECK-32-NEXT:    [[ADD12:%.*]] = add nsw i32 0, [[MUL11]]
+// CHECK-32-NEXT:    store i32 [[ADD12]], ptr [[I3]], align 4
+// CHECK-32-NEXT:    br label [[SIMD_IF_END]]
+// CHECK-32:       simd.if.end:
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK-32-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP4]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK-32-NEXT:    store i32 [[ADD1]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK-32-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[N1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[N1]], align 4
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP3]], 10
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP5]]
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-32-NEXT:    store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[N1]], align 4
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    store i32 [[ADD4]], ptr [[TMP1]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK-32-EX-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-EX-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-EX-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK-32-EX:       simd.if.then:
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP8]]
+// CHECK-32-EX-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[SUB7:%.*]] = sub nsw i32 [[TMP10]], 0
+// CHECK-32-EX-NEXT:    [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK-32-EX-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK-32-EX-NEXT:    [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK-32-EX-NEXT:    br label [[SIMD_IF_END]]
+// CHECK-32-EX:       simd.if.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK-32-EX-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-EX-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-EX-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[SIMD_IF_THEN:%.*]], label [[SIMD_IF_END:%.*]]
+// CHECK-32-EX:       simd.if.then:
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[ADD]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP8]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP9]] to i32
+// CHECK-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-32-EX-NEXT:    [[CONV7:%.*]] = trunc i32 [[ADD6]] to i16
+// CHECK-32-EX-NEXT:    store i16 [[CONV7]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[SUB9:%.*]] = sub nsw i32 [[TMP11]], 0
+// CHECK-32-EX-NEXT:    [[DIV10:%.*]] = sdiv i32 [[SUB9]], 1
+// CHECK-32-EX-NEXT:    [[MUL11:%.*]] = mul nsw i32 [[DIV10]], 1
+// CHECK-32-EX-NEXT:    [[ADD12:%.*]] = add nsw i32 0, [[MUL11]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD12]], ptr [[I3]], align 4
+// CHECK-32-EX-NEXT:    br label [[SIMD_IF_END]]
+// CHECK-32-EX:       simd.if.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK-32-EX-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP3]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP4]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[ADD1:%.*]] = add nsw i32 [[TMP5]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD1]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
+// CHECK-32-EX-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[N1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[N1]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP3]], 10
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP4]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP5]]
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP6]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[N1]], align 4
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], ptr [[TMP1]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+//
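
For reference, the NOTE line that heads each updated test records the exact UTC_ARGS used, so the assertions above and below can be regenerated verbatim. A minimal invocation sketch for the next file in the diff, run from an llvm-project checkout and assuming a built clang is reachable on PATH (the flags are copied from the NOTE line; nothing else is added):

    python3 llvm/utils/update_cc_test_checks.py \
      --function-signature --include-generated-funcs \
      --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" \
      --prefix-filecheck-ir-name _ \
      clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp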

diff  --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
index e7ba9671b97bd..b9e79e22c5ea0 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
@@ -1,25 +1,22 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK45-64
 // RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK45-32
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefixes=CHECK45-32,CHECK45-32-EX
 
 // RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-64
 // RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK-32
+// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefixes=CHECK-32,CHECK-32-EX
 
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
 
 // Check that the execution mode of all 4 target regions on the GPU is set to NonSPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak protected constant i8 2
-// CHECK-DAG: {{@__omp_offloading_.+l53}}_exec_mode = weak protected constant i8 2
 
 #define N 1000
 #define M 10
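
The hunk below drops the hand-written summary checks (__kmpc_target_init/__kmpc_target_deinit in mode 2, __kmpc_distribute_static_init_4 with schedule kind 91) in favor of fully autogenerated ones. For orientation, schedule kind 91 corresponds to kmp_distribute_static_chunked in the OpenMP runtime; a minimal, hypothetical C++ sketch of a construct that lowers to it (placeholders only, not this test's source):

    // Hypothetical sketch; `a` and `N` are placeholders, not the test's identifiers.
    #pragma omp target teams distribute simd dist_schedule(static, 128)
    for (int i = 0; i < N; ++i)
      a[i] = i; // device IR: __kmpc_distribute_static_init_4(..., /*schedtype=*/91, ..., /*chunk=*/128)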
@@ -69,38 +66,3481 @@ int bar(int n){
   return a;
 }
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l37(
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-
-// CHECK: call void @__kmpc_distribute_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
-// CHECK: call void @__kmpc_distribute_static_fini(
-// CHECK: ret void
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-
-// CHECK: call void @__kmpc_distribute_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
-// CHECK: call void @__kmpc_distribute_static_fini(
-// CHECK: ret void
-
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l48(
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-
-// CHECK: call void @__kmpc_distribute_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
-// CHECK: call void @__kmpc_distribute_static_fini(
-// CHECK: ret void
-
-// CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l53({{.+}}, i{{32|64}} [[F_IN:%.+]])
-// CHECK: store {{.+}} [[F_IN]], ptr {{.+}},
-// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false)
-// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2)
-
-// CHECK: store {{.+}} 99, ptr [[COMB_UB:%.+]], align
-// CHECK: call void @__kmpc_distribute_static_init_4({{.+}}, {{.+}}, {{.+}} 91, {{.+}}, {{.+}}, ptr [[COMB_UB]],
-// CHECK: call void @__kmpc_distribute_static_fini(
-// CHECK: ret void
-
 #endif
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK45-64-SAME: (i64 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[L]], ptr [[L_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP5]], ptr [[L_CASTED]], align 4
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[L_CASTED]], align 8
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-64-NEXT:    call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]], i64 [[TMP6]]) #[[ATTR3:[0-9]+]]
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK45-64-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I4:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[L]], ptr [[L_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK45-64-NEXT:    [[L1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK45-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-64-NEXT:    [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-64-NEXT:    store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK45-64-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK45-64:       omp.precond.then:
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK45-64-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-64:       omp.dispatch.cond:
+// CHECK45-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-64-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK45-64-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-64:       cond.true:
+// CHECK45-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-64:       cond.false:
+// CHECK45-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    br label [[COND_END]]
+// CHECK45-64:       cond.end:
+// CHECK45-64-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK45-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK45-64-NEXT:    br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-64:       omp.dispatch.body:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-64:       omp.inner.for.cond:
+// CHECK45-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK45-64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[CMP7:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK45-64-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-64:       omp.inner.for.body:
+// CHECK45-64-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-64-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK45-64-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[TMP18:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    store i32 [[TMP18]], ptr [[L_ADDR]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-64:       omp.body.continue:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-64:       omp.inner.for.inc:
+// CHECK45-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK45-64:       omp.inner.for.end:
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-64:       omp.dispatch.inc:
+// CHECK45-64-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-64-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-64-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-64:       omp.dispatch.end:
+// CHECK45-64-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK45-64-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK45-64-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK45-64-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-64:       .omp.final.then:
+// CHECK45-64-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[SUB11:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK45-64-NEXT:    [[DIV12:%.*]] = sdiv i32 [[SUB11]], 1
+// CHECK45-64-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 1
+// CHECK45-64-NEXT:    [[ADD14:%.*]] = add nsw i32 0, [[MUL13]]
+// CHECK45-64-NEXT:    store i32 [[ADD14]], ptr [[I4]], align 4
+// CHECK45-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-64:       .omp.final.done:
+// CHECK45-64-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK45-64-NEXT:    br i1 [[TMP30]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK45-64:       .omp.lastprivate.then:
+// CHECK45-64-NEXT:    [[TMP31:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP31]], ptr [[L_ADDR]], align 4
+// CHECK45-64-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK45-64:       .omp.lastprivate.done:
+// CHECK45-64-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK45-64:       omp.precond.end:
+// CHECK45-64-NEXT:    call void @__kmpc_free_shared(ptr [[L1]], i64 4)
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39
+// CHECK45-64-SAME: (i64 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-64-NEXT:    call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK45-64-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK45-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-64-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-64-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK45-64-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK45-64:       omp.precond.then:
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK45-64-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-64:       omp.dispatch.cond:
+// CHECK45-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK45-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-64:       cond.true:
+// CHECK45-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-64:       cond.false:
+// CHECK45-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    br label [[COND_END]]
+// CHECK45-64:       cond.end:
+// CHECK45-64-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK45-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK45-64-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-64:       omp.dispatch.body:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-64:       omp.inner.for.cond:
+// CHECK45-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK45-64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK45-64-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-64:       omp.inner.for.body:
+// CHECK45-64-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-64-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK45-64-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+// CHECK45-64-NEXT:    [[ADD7:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-64-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD7]] to i16
+// CHECK45-64-NEXT:    store i16 [[CONV8]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-64:       omp.body.continue:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-64:       omp.inner.for.inc:
+// CHECK45-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK45-64:       omp.inner.for.end:
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-64:       omp.dispatch.inc:
+// CHECK45-64-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-64-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-64-NEXT:    store i32 [[ADD11]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-64:       omp.dispatch.end:
+// CHECK45-64-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK45-64-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK45-64-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK45-64-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-64:       .omp.final.then:
+// CHECK45-64-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-64-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK45-64-NEXT:    [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK45-64-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
+// CHECK45-64-NEXT:    [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
+// CHECK45-64-NEXT:    store i32 [[ADD15]], ptr [[I3]], align 4
+// CHECK45-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-64:       .omp.final.done:
+// CHECK45-64-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK45-64:       omp.precond.end:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK45-64-SAME: (ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-64-NEXT:    call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK45-64-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-64-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-64:       omp.dispatch.cond:
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK45-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-64:       cond.true:
+// CHECK45-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-64:       cond.false:
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    br label [[COND_END]]
+// CHECK45-64:       cond.end:
+// CHECK45-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK45-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK45-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-64:       omp.dispatch.body:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-64:       omp.inner.for.cond:
+// CHECK45-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK45-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK45-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-64:       omp.inner.for.body:
+// CHECK45-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK45-64-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-64:       omp.body.continue:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-64:       omp.inner.for.inc:
+// CHECK45-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK45-64:       omp.inner.for.end:
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-64:       omp.dispatch.inc:
+// CHECK45-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK45-64-NEXT:    store i32 [[ADD5]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK45-64-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-64:       omp.dispatch.end:
+// CHECK45-64-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK45-64-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK45-64-NEXT:    br i1 [[TMP19]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-64:       .omp.final.then:
+// CHECK45-64-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-64:       .omp.final.done:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK45-64-SAME: (ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-64:       user_code.entry:
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[F_CASTED]], align 8
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-64-NEXT:    call void @__omp_outlined__3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i64 [[TMP4]]) #[[ATTR3]]
+// CHECK45-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-64-NEXT:    ret void
+// CHECK45-64:       worker.exit:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-64-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK45-64-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR1]] {
+// CHECK45-64-NEXT:  entry:
+// CHECK45-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK45-64-NEXT:    [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK45-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK45-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK45-64-NEXT:    store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK45-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-64-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-64:       omp.dispatch.cond:
+// CHECK45-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK45-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-64:       cond.true:
+// CHECK45-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-64:       cond.false:
+// CHECK45-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    br label [[COND_END]]
+// CHECK45-64:       cond.end:
+// CHECK45-64-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK45-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK45-64-NEXT:    br i1 [[CMP2]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-64:       omp.dispatch.body:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-64:       omp.inner.for.cond:
+// CHECK45-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK45-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK45-64-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-64:       omp.inner.for.body:
+// CHECK45-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK45-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK45-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP12]], 10
+// CHECK45-64-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 10
+// CHECK45-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK45-64-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK45-64-NEXT:    [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK45-64-NEXT:    store i32 [[ADD7]], ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    store i32 10, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[F_ADDR]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]]
+// CHECK45-64-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP13]], [[MUL8]]
+// CHECK45-64-NEXT:    [[TMP16:%.*]] = load i32, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[ADD10:%.*]] = add nsw i32 [[ADD9]], [[TMP16]]
+// CHECK45-64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK45-64-NEXT:    [[TMP18:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP18]] to i64
+// CHECK45-64-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// CHECK45-64-NEXT:    store i32 [[ADD10]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-64:       omp.body.continue:
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-64:       omp.inner.for.inc:
+// CHECK45-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-64-NEXT:    store i32 [[ADD13]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK45-64:       omp.inner.for.end:
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-64:       omp.dispatch.inc:
+// CHECK45-64-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-64-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-64-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-64-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-64-NEXT:    store i32 [[ADD15]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-64:       omp.dispatch.end:
+// CHECK45-64-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK45-64-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK45-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-64:       .omp.final.then:
+// CHECK45-64-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-64-NEXT:    store i32 10, ptr [[J]], align 4
+// CHECK45-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-64:       .omp.final.done:
+// CHECK45-64-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK45-32-SAME: (i32 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[L]], ptr [[L_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP5]], ptr [[L_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[L_CASTED]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-32-NEXT:    call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]]
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK45-32-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I4:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[L]], ptr [[L_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK45-32-NEXT:    [[L1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 4)
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK45-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-32-NEXT:    [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-32-NEXT:    store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK45-32-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK45-32:       omp.precond.then:
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-32:       omp.dispatch.cond:
+// CHECK45-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-32-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK45-32-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-32:       cond.true:
+// CHECK45-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-32:       cond.false:
+// CHECK45-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    br label [[COND_END]]
+// CHECK45-32:       cond.end:
+// CHECK45-32-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK45-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK45-32-NEXT:    br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-32:       omp.dispatch.body:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32:       omp.inner.for.cond:
+// CHECK45-32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK45-32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[CMP7:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK45-32-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32:       omp.inner.for.body:
+// CHECK45-32-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK45-32-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[TMP18:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    store i32 [[TMP18]], ptr [[L_ADDR]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32:       omp.body.continue:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32:       omp.inner.for.inc:
+// CHECK45-32-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK45-32:       omp.inner.for.end:
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-32:       omp.dispatch.inc:
+// CHECK45-32-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-32-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-32-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-32:       omp.dispatch.end:
+// CHECK45-32-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK45-32-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK45-32-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-32:       .omp.final.then:
+// CHECK45-32-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[SUB11:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK45-32-NEXT:    [[DIV12:%.*]] = sdiv i32 [[SUB11]], 1
+// CHECK45-32-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 1
+// CHECK45-32-NEXT:    [[ADD14:%.*]] = add nsw i32 0, [[MUL13]]
+// CHECK45-32-NEXT:    store i32 [[ADD14]], ptr [[I4]], align 4
+// CHECK45-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-32:       .omp.final.done:
+// CHECK45-32-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK45-32-NEXT:    br i1 [[TMP30]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK45-32:       .omp.lastprivate.then:
+// CHECK45-32-NEXT:    [[TMP31:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP31]], ptr [[L_ADDR]], align 4
+// CHECK45-32-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK45-32:       .omp.lastprivate.done:
+// CHECK45-32-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK45-32:       omp.precond.end:
+// CHECK45-32-NEXT:    call void @__kmpc_free_shared(ptr [[L1]], i32 4)
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39
+// CHECK45-32-SAME: (i32 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-32-NEXT:    call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK45-32-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK45-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-32-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-32-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK45-32-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK45-32:       omp.precond.then:
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-32:       omp.dispatch.cond:
+// CHECK45-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK45-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-32:       cond.true:
+// CHECK45-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-32:       cond.false:
+// CHECK45-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    br label [[COND_END]]
+// CHECK45-32:       cond.end:
+// CHECK45-32-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK45-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK45-32-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-32:       omp.dispatch.body:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32:       omp.inner.for.cond:
+// CHECK45-32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK45-32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK45-32-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32:       omp.inner.for.body:
+// CHECK45-32-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK45-32-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+// CHECK45-32-NEXT:    [[ADD7:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-32-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD7]] to i16
+// CHECK45-32-NEXT:    store i16 [[CONV8]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32:       omp.body.continue:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32:       omp.inner.for.inc:
+// CHECK45-32-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK45-32:       omp.inner.for.end:
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-32:       omp.dispatch.inc:
+// CHECK45-32-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-32-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-32-NEXT:    store i32 [[ADD11]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-32:       omp.dispatch.end:
+// CHECK45-32-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK45-32-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK45-32-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-32:       .omp.final.then:
+// CHECK45-32-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK45-32-NEXT:    [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK45-32-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
+// CHECK45-32-NEXT:    [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
+// CHECK45-32-NEXT:    store i32 [[ADD15]], ptr [[I3]], align 4
+// CHECK45-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-32:       .omp.final.done:
+// CHECK45-32-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK45-32:       omp.precond.end:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK45-32-SAME: (ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-32-NEXT:    call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK45-32-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-32:       omp.dispatch.cond:
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK45-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-32:       cond.true:
+// CHECK45-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-32:       cond.false:
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    br label [[COND_END]]
+// CHECK45-32:       cond.end:
+// CHECK45-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK45-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK45-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-32:       omp.dispatch.body:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32:       omp.inner.for.cond:
+// CHECK45-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK45-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK45-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32:       omp.inner.for.body:
+// CHECK45-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP11]]
+// CHECK45-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32:       omp.body.continue:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32:       omp.inner.for.inc:
+// CHECK45-32-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK45-32:       omp.inner.for.end:
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-32:       omp.dispatch.inc:
+// CHECK45-32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK45-32-NEXT:    store i32 [[ADD5]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK45-32-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-32:       omp.dispatch.end:
+// CHECK45-32-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK45-32-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK45-32-NEXT:    br i1 [[TMP19]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-32:       .omp.final.then:
+// CHECK45-32-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-32:       .omp.final.done:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK45-32-SAME: (ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32:       user_code.entry:
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[F_CASTED]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-32-NEXT:    call void @__omp_outlined__3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i32 [[TMP4]]) #[[ATTR3]]
+// CHECK45-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-NEXT:    ret void
+// CHECK45-32:       worker.exit:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK45-32-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR1]] {
+// CHECK45-32-NEXT:  entry:
+// CHECK45-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-NEXT:    [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK45-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-32-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-32:       omp.dispatch.cond:
+// CHECK45-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK45-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-32:       cond.true:
+// CHECK45-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-32:       cond.false:
+// CHECK45-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    br label [[COND_END]]
+// CHECK45-32:       cond.end:
+// CHECK45-32-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK45-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK45-32-NEXT:    br i1 [[CMP2]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-32:       omp.dispatch.body:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32:       omp.inner.for.cond:
+// CHECK45-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK45-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK45-32-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32:       omp.inner.for.body:
+// CHECK45-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK45-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK45-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP12]], 10
+// CHECK45-32-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 10
+// CHECK45-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK45-32-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK45-32-NEXT:    [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK45-32-NEXT:    store i32 [[ADD7]], ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    store i32 10, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[F_ADDR]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]]
+// CHECK45-32-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP13]], [[MUL8]]
+// CHECK45-32-NEXT:    [[TMP16:%.*]] = load i32, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[ADD10:%.*]] = add nsw i32 [[ADD9]], [[TMP16]]
+// CHECK45-32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK45-32-NEXT:    [[TMP18:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP18]]
+// CHECK45-32-NEXT:    store i32 [[ADD10]], ptr [[ARRAYIDX11]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32:       omp.body.continue:
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32:       omp.inner.for.inc:
+// CHECK45-32-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-32-NEXT:    store i32 [[ADD12]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK45-32:       omp.inner.for.end:
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-32:       omp.dispatch.inc:
+// CHECK45-32-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-32-NEXT:    store i32 [[ADD13]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-32-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-32:       omp.dispatch.end:
+// CHECK45-32-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK45-32-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK45-32-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-32:       .omp.final.then:
+// CHECK45-32-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-32-NEXT:    store i32 10, ptr [[J]], align 4
+// CHECK45-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-32:       .omp.final.done:
+// CHECK45-32-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK45-32-EX-SAME: (i32 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[L]], ptr [[L_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP5]], ptr [[L_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[L_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-32-EX-NEXT:    call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]]
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK45-32-EX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I4:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[L]], ptr [[L_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[L1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 4)
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK45-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-32-EX-NEXT:    [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK45-32-EX:       omp.precond.then:
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-32-EX:       omp.dispatch.cond:
+// CHECK45-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-32-EX:       cond.true:
+// CHECK45-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK45-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-32-EX:       cond.false:
+// CHECK45-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    br label [[COND_END]]
+// CHECK45-32-EX:       cond.end:
+// CHECK45-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK45-32-EX-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-32-EX:       omp.dispatch.body:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32-EX:       omp.inner.for.cond:
+// CHECK45-32-EX-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK45-32-EX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[CMP7:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32-EX:       omp.inner.for.body:
+// CHECK45-32-EX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK45-32-EX-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[TMP18:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    store i32 [[TMP18]], ptr [[L_ADDR]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32-EX:       omp.body.continue:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32-EX:       omp.inner.for.inc:
+// CHECK45-32-EX-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK45-32-EX:       omp.inner.for.end:
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-32-EX:       omp.dispatch.inc:
+// CHECK45-32-EX-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-32-EX:       omp.dispatch.end:
+// CHECK45-32-EX-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK45-32-EX-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK45-32-EX-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-32-EX:       .omp.final.then:
+// CHECK45-32-EX-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[SUB11:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK45-32-EX-NEXT:    [[DIV12:%.*]] = sdiv i32 [[SUB11]], 1
+// CHECK45-32-EX-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 1
+// CHECK45-32-EX-NEXT:    [[ADD14:%.*]] = add nsw i32 0, [[MUL13]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD14]], ptr [[I4]], align 4
+// CHECK45-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-32-EX:       .omp.final.done:
+// CHECK45-32-EX-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK45-32-EX-NEXT:    br i1 [[TMP30]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK45-32-EX:       .omp.lastprivate.then:
+// CHECK45-32-EX-NEXT:    [[TMP31:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP31]], ptr [[L_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK45-32-EX:       .omp.lastprivate.done:
+// CHECK45-32-EX-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK45-32-EX:       omp.precond.end:
+// CHECK45-32-EX-NEXT:    call void @__kmpc_free_shared(ptr [[L1]], i32 4)
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39
+// CHECK45-32-EX-SAME: (i32 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-32-EX-NEXT:    call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK45-32-EX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK45-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK45-32-EX-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK45-32-EX:       omp.precond.then:
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-32-EX:       omp.dispatch.cond:
+// CHECK45-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-32-EX:       cond.true:
+// CHECK45-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK45-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-32-EX:       cond.false:
+// CHECK45-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    br label [[COND_END]]
+// CHECK45-32-EX:       cond.end:
+// CHECK45-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK45-32-EX-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-32-EX:       omp.dispatch.body:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32-EX:       omp.inner.for.cond:
+// CHECK45-32-EX-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK45-32-EX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32-EX:       omp.inner.for.body:
+// CHECK45-32-EX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK45-32-EX-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+// CHECK45-32-EX-NEXT:    [[ADD7:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK45-32-EX-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD7]] to i16
+// CHECK45-32-EX-NEXT:    store i16 [[CONV8]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32-EX:       omp.body.continue:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32-EX:       omp.inner.for.inc:
+// CHECK45-32-EX-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK45-32-EX:       omp.inner.for.end:
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-32-EX:       omp.dispatch.inc:
+// CHECK45-32-EX-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD11]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-32-EX:       omp.dispatch.end:
+// CHECK45-32-EX-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK45-32-EX-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK45-32-EX-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-32-EX:       .omp.final.then:
+// CHECK45-32-EX-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK45-32-EX-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK45-32-EX-NEXT:    [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK45-32-EX-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
+// CHECK45-32-EX-NEXT:    [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD15]], ptr [[I3]], align 4
+// CHECK45-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-32-EX:       .omp.final.done:
+// CHECK45-32-EX-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK45-32-EX:       omp.precond.end:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK45-32-EX-SAME: (ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-32-EX-NEXT:    call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK45-32-EX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-32-EX:       omp.dispatch.cond:
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK45-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-32-EX:       cond.true:
+// CHECK45-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-32-EX:       cond.false:
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    br label [[COND_END]]
+// CHECK45-32-EX:       cond.end:
+// CHECK45-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK45-32-EX-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-32-EX:       omp.dispatch.body:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32-EX:       omp.inner.for.cond:
+// CHECK45-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK45-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32-EX:       omp.inner.for.body:
+// CHECK45-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP11]]
+// CHECK45-32-EX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32-EX:       omp.body.continue:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32-EX:       omp.inner.for.inc:
+// CHECK45-32-EX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK45-32-EX:       omp.inner.for.end:
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-32-EX:       omp.dispatch.inc:
+// CHECK45-32-EX-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD5]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-32-EX:       omp.dispatch.end:
+// CHECK45-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK45-32-EX-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK45-32-EX-NEXT:    br i1 [[TMP19]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-32-EX:       .omp.final.then:
+// CHECK45-32-EX-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-32-EX:       .omp.final.done:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK45-32-EX-SAME: (ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK45-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK45-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK45-32-EX:       user_code.entry:
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[F_CASTED]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK45-32-EX-NEXT:    call void @__omp_outlined__3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i32 [[TMP4]]) #[[ATTR3]]
+// CHECK45-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK45-32-EX-NEXT:    ret void
+// CHECK45-32-EX:       worker.exit:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK45-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK45-32-EX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR1]] {
+// CHECK45-32-EX-NEXT:  entry:
+// CHECK45-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK45-32-EX-NEXT:    [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK45-32-EX-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK45-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK45-32-EX:       omp.dispatch.cond:
+// CHECK45-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK45-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK45-32-EX:       cond.true:
+// CHECK45-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK45-32-EX:       cond.false:
+// CHECK45-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    br label [[COND_END]]
+// CHECK45-32-EX:       cond.end:
+// CHECK45-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK45-32-EX-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK45-32-EX:       omp.dispatch.body:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK45-32-EX:       omp.inner.for.cond:
+// CHECK45-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK45-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK45-32-EX-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK45-32-EX:       omp.inner.for.body:
+// CHECK45-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK45-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK45-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP12]], 10
+// CHECK45-32-EX-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 10
+// CHECK45-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK45-32-EX-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK45-32-EX-NEXT:    [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD7]], ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    store i32 10, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[F_ADDR]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]]
+// CHECK45-32-EX-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP13]], [[MUL8]]
+// CHECK45-32-EX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[ADD10:%.*]] = add nsw i32 [[ADD9]], [[TMP16]]
+// CHECK45-32-EX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK45-32-EX-NEXT:    [[TMP18:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP18]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD10]], ptr [[ARRAYIDX11]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK45-32-EX:       omp.body.continue:
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK45-32-EX:       omp.inner.for.inc:
+// CHECK45-32-EX-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK45-32-EX-NEXT:    store i32 [[ADD12]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK45-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK45-32-EX:       omp.inner.for.end:
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK45-32-EX:       omp.dispatch.inc:
+// CHECK45-32-EX-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD13]], ptr [[DOTOMP_LB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK45-32-EX-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK45-32-EX-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_UB]], align 4
+// CHECK45-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK45-32-EX:       omp.dispatch.end:
+// CHECK45-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK45-32-EX-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK45-32-EX-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK45-32-EX-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK45-32-EX:       .omp.final.then:
+// CHECK45-32-EX-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK45-32-EX-NEXT:    store i32 10, ptr [[J]], align 4
+// CHECK45-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK45-32-EX:       .omp.final.done:
+// CHECK45-32-EX-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK-64-SAME: (i64 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[L_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[L]], ptr [[L_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP5]], ptr [[L_CASTED]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[L_CASTED]], align 8
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]], i64 [[TMP6]]) #[[ATTR3:[0-9]+]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-64-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 [[L:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[L_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I4:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[L]], ptr [[L_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-64-NEXT:    [[L1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-64-NEXT:    [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-64-NEXT:    store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-64:       omp.precond.then:
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-64-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK-64-NEXT:    br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[CMP7:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-64-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-64-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    store i32 [[TMP18]], ptr [[L_ADDR]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-64-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-64-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-64-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK-64-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK-64-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[SUB11:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK-64-NEXT:    [[DIV12:%.*]] = sdiv i32 [[SUB11]], 1
+// CHECK-64-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 1
+// CHECK-64-NEXT:    [[ADD14:%.*]] = add nsw i32 0, [[MUL13]]
+// CHECK-64-NEXT:    store i32 [[ADD14]], ptr [[I4]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK-64-NEXT:    br i1 [[TMP30]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-64:       .omp.lastprivate.then:
+// CHECK-64-NEXT:    [[TMP31:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP31]], ptr [[L_ADDR]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-64:       .omp.lastprivate.done:
+// CHECK-64-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK-64:       omp.precond.end:
+// CHECK-64-NEXT:    call void @__kmpc_free_shared(ptr [[L1]], i64 4)
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39
+// CHECK-64-SAME: (i64 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-64-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i64 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-64-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-64-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-64:       omp.precond.then:
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK-64-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-64-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+// CHECK-64-NEXT:    [[ADD7:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-64-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD7]] to i16
+// CHECK-64-NEXT:    store i16 [[CONV8]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-64-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-64-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-64-NEXT:    store i32 [[ADD11]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK-64-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK-64-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK-64-NEXT:    [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK-64-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
+// CHECK-64-NEXT:    [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
+// CHECK-64-NEXT:    store i32 [[ADD15]], ptr [[I3]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK-64:       omp.precond.end:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK-64-SAME: (ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-64-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK-64-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK-64-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-64-NEXT:    store i32 [[ADD5]], ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK-64-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-64-NEXT:    br i1 [[TMP19]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK-64-SAME: (ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR0]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-64-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-64-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-64:       user_code.entry:
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[F_CASTED]], align 8
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-64-NEXT:    call void @__omp_outlined__3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i64 [[TMP4]]) #[[ATTR3]]
+// CHECK-64-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-64-NEXT:    ret void
+// CHECK-64:       worker.exit:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK-64-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i64 [[F:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT:  entry:
+// CHECK-64-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT:    [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK-64-NEXT:    store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-64-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-64:       omp.dispatch.cond:
+// CHECK-64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK-64-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64:       cond.true:
+// CHECK-64-NEXT:    br label [[COND_END:%.*]]
+// CHECK-64:       cond.false:
+// CHECK-64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[COND_END]]
+// CHECK-64:       cond.end:
+// CHECK-64-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-64-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT:    br i1 [[CMP2]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-64:       omp.dispatch.body:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64:       omp.inner.for.cond:
+// CHECK-64-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64:       omp.inner.for.body:
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK-64-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK-64-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP12]], 10
+// CHECK-64-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 10
+// CHECK-64-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK-64-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK-64-NEXT:    [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK-64-NEXT:    store i32 [[ADD7]], ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    store i32 10, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[F_ADDR]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-64-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP13]], [[MUL8]]
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load i32, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[ADD10:%.*]] = add nsw i32 [[ADD9]], [[TMP16]]
+// CHECK-64-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[TMP18]] to i64
+// CHECK-64-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// CHECK-64-NEXT:    store i32 [[ADD10]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64:       omp.body.continue:
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64:       omp.inner.for.inc:
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-64-NEXT:    store i32 [[ADD13]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-64-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK-64:       omp.inner.for.end:
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-64:       omp.dispatch.inc:
+// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-64-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT:    [[ADD15:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-64-NEXT:    store i32 [[ADD15]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-64:       omp.dispatch.end:
+// CHECK-64-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-64-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-64:       .omp.final.then:
+// CHECK-64-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-64-NEXT:    store i32 10, ptr [[J]], align 4
+// CHECK-64-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-64:       .omp.final.done:
+// CHECK-64-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK-32-SAME: (i32 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[L]], ptr [[L_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP5]], ptr [[L_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[L_CASTED]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-32-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I4:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[L]], ptr [[L_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-32-NEXT:    [[L1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 4)
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-NEXT:    [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-NEXT:    store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-32:       omp.precond.then:
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK-32-NEXT:    br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[CMP7:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK-32-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    store i32 [[TMP18]], ptr [[L_ADDR]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-32-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-32-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK-32-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK-32-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[SUB11:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK-32-NEXT:    [[DIV12:%.*]] = sdiv i32 [[SUB11]], 1
+// CHECK-32-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 1
+// CHECK-32-NEXT:    [[ADD14:%.*]] = add nsw i32 0, [[MUL13]]
+// CHECK-32-NEXT:    store i32 [[ADD14]], ptr [[I4]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK-32-NEXT:    br i1 [[TMP30]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-32:       .omp.lastprivate.then:
+// CHECK-32-NEXT:    [[TMP31:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP31]], ptr [[L_ADDR]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-32:       .omp.lastprivate.done:
+// CHECK-32-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK-32:       omp.precond.end:
+// CHECK-32-NEXT:    call void @__kmpc_free_shared(ptr [[L1]], i32 4)
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39
+// CHECK-32-SAME: (i32 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-32-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-32:       omp.precond.then:
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK-32-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+// CHECK-32-NEXT:    [[ADD7:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-32-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD7]] to i16
+// CHECK-32-NEXT:    store i16 [[CONV8]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-32-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-32-NEXT:    store i32 [[ADD11]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK-32-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK-32-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK-32-NEXT:    [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK-32-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
+// CHECK-32-NEXT:    [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
+// CHECK-32-NEXT:    store i32 [[ADD15]], ptr [[I3]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK-32:       omp.precond.end:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK-32-SAME: (ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-32-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP11]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK-32-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK-32-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    store i32 [[ADD5]], ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK-32-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-32-NEXT:    br i1 [[TMP19]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK-32-SAME: (ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32:       user_code.entry:
+// CHECK-32-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[F_CASTED]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-NEXT:    call void @__omp_outlined__3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i32 [[TMP4]]) #[[ATTR3]]
+// CHECK-32-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-NEXT:    ret void
+// CHECK-32:       worker.exit:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK-32-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR1]] {
+// CHECK-32-NEXT:  entry:
+// CHECK-32-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-NEXT:    [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32:       omp.dispatch.cond:
+// CHECK-32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK-32-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32:       cond.true:
+// CHECK-32-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32:       cond.false:
+// CHECK-32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[COND_END]]
+// CHECK-32:       cond.end:
+// CHECK-32-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-32-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT:    br i1 [[CMP2]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32:       omp.dispatch.body:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32:       omp.inner.for.cond:
+// CHECK-32-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK-32-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32:       omp.inner.for.body:
+// CHECK-32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK-32-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK-32-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP12]], 10
+// CHECK-32-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 10
+// CHECK-32-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK-32-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK-32-NEXT:    [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK-32-NEXT:    store i32 [[ADD7]], ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    store i32 10, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[F_ADDR]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP13]], [[MUL8]]
+// CHECK-32-NEXT:    [[TMP16:%.*]] = load i32, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[ADD10:%.*]] = add nsw i32 [[ADD9]], [[TMP16]]
+// CHECK-32-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK-32-NEXT:    [[TMP18:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP18]]
+// CHECK-32-NEXT:    store i32 [[ADD10]], ptr [[ARRAYIDX11]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32:       omp.body.continue:
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32:       omp.inner.for.inc:
+// CHECK-32-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-32-NEXT:    store i32 [[ADD12]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK-32:       omp.inner.for.end:
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32:       omp.dispatch.inc:
+// CHECK-32-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-NEXT:    store i32 [[ADD13]], ptr [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-32-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32:       omp.dispatch.end:
+// CHECK-32-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-32-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32:       .omp.final.then:
+// CHECK-32-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-32-NEXT:    store i32 10, ptr [[J]], align 4
+// CHECK-32-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32:       .omp.final.done:
+// CHECK-32-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK-32-EX-SAME: (i32 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[L_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[L]], ptr [[L_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP5]], ptr [[L_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[L_CASTED]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-32-EX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], ptr nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 [[L:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[L_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I4:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[L]], ptr [[L_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[L1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 4)
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-EX-NEXT:    [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-EX-NEXT:    store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-32-EX:       omp.precond.then:
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 128)
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-EX-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP6]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[CMP7:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK-32-EX-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    store i32 [[TMP18]], ptr [[L_ADDR]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    [[ADD8:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP12]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[SUB11:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK-32-EX-NEXT:    [[DIV12:%.*]] = sdiv i32 [[SUB11]], 1
+// CHECK-32-EX-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[DIV12]], 1
+// CHECK-32-EX-NEXT:    [[ADD14:%.*]] = add nsw i32 0, [[MUL13]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD14]], ptr [[I4]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP30]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK-32-EX:       .omp.lastprivate.then:
+// CHECK-32-EX-NEXT:    [[TMP31:%.*]] = load i32, ptr [[L_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP31]], ptr [[L_ADDR]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK-32-EX:       .omp.lastprivate.done:
+// CHECK-32-EX-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK-32-EX:       omp.precond.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_free_shared(ptr [[L1]], i32 4)
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l39
+// CHECK-32-EX-SAME: (i32 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__1(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__1
+// CHECK-32-EX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], i32 [[N:%.*]], ptr nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-EX-NEXT:    [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-EX-NEXT:    store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-32-EX:       omp.precond.then:
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-EX-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+// CHECK-32-EX-NEXT:    [[ADD7:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK-32-EX-NEXT:    [[CONV8:%.*]] = trunc i32 [[ADD7]] to i16
+// CHECK-32-EX-NEXT:    store i16 [[CONV8]], ptr [[ARRAYIDX]], align 2, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD11]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP25]])
+// CHECK-32-EX-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP27]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-EX-NEXT:    [[SUB12:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK-32-EX-NEXT:    [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK-32-EX-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[DIV13]], 1
+// CHECK-32-EX-NEXT:    [[ADD15:%.*]] = add nsw i32 0, [[MUL14]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD15]], ptr [[I3]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK-32-EX:       omp.precond.end:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l44
+// CHECK-32-EX-SAME: (ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR3]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__2
+// CHECK-32-EX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP11]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP19]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD5]], ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP19]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l49
+// CHECK-32-EX-SAME: (ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR0]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK-32-EX-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-32-EX-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-32-EX:       user_code.entry:
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[F_CASTED]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK-32-EX-NEXT:    call void @__omp_outlined__3(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i32 [[TMP4]]) #[[ATTR3]]
+// CHECK-32-EX-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK-32-EX-NEXT:    ret void
+// CHECK-32-EX:       worker.exit:
+// CHECK-32-EX-NEXT:    ret void
+//
+//
+// CHECK-32-EX-LABEL: define {{[^@]+}}@__omp_outlined__3
+// CHECK-32-EX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(400) [[C:%.*]], i32 [[F:%.*]]) #[[ATTR1]] {
+// CHECK-32-EX-NEXT:  entry:
+// CHECK-32-EX-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK-32-EX-NEXT:    [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[K:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    [[J:%.*]] = alloca i32, align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-32-EX-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-EX-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
+// CHECK-32-EX:       omp.dispatch.cond:
+// CHECK-32-EX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK-32-EX-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32-EX:       cond.true:
+// CHECK-32-EX-NEXT:    br label [[COND_END:%.*]]
+// CHECK-32-EX:       cond.false:
+// CHECK-32-EX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[COND_END]]
+// CHECK-32-EX:       cond.end:
+// CHECK-32-EX-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-32-EX-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-32-EX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP2]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// CHECK-32-EX:       omp.dispatch.body:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32-EX:       omp.inner.for.cond:
+// CHECK-32-EX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK-32-EX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK-32-EX-NEXT:    br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32-EX:       omp.inner.for.body:
+// CHECK-32-EX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[DIV:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK-32-EX-NEXT:    [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK-32-EX-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[DIV4:%.*]] = sdiv i32 [[TMP12]], 10
+// CHECK-32-EX-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 10
+// CHECK-32-EX-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK-32-EX-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK-32-EX-NEXT:    [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD7]], ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    store i32 10, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[TMP14:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[F_ADDR]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[MUL8:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-32-EX-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP13]], [[MUL8]]
+// CHECK-32-EX-NEXT:    [[TMP16:%.*]] = load i32, ptr [[K]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[ADD10:%.*]] = add nsw i32 [[ADD9]], [[TMP16]]
+// CHECK-32-EX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK-32-EX-NEXT:    [[TMP18:%.*]] = load i32, ptr [[J]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP18]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD10]], ptr [[ARRAYIDX11]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32-EX:       omp.body.continue:
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32-EX:       omp.inner.for.inc:
+// CHECK-32-EX-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK-32-EX-NEXT:    store i32 [[ADD12]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK-32-EX-NEXT:    br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK-32-EX:       omp.inner.for.end:
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
+// CHECK-32-EX:       omp.dispatch.inc:
+// CHECK-32-EX-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD13]], ptr [[DOTOMP_LB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-32-EX-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-32-EX-NEXT:    store i32 [[ADD14]], ptr [[DOTOMP_UB]], align 4
+// CHECK-32-EX-NEXT:    br label [[OMP_DISPATCH_COND]]
+// CHECK-32-EX:       omp.dispatch.end:
+// CHECK-32-EX-NEXT:    call void @__kmpc_distribute_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-32-EX-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-EX-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+// CHECK-32-EX-NEXT:    br i1 [[TMP25]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK-32-EX:       .omp.final.then:
+// CHECK-32-EX-NEXT:    store i32 10, ptr [[I]], align 4
+// CHECK-32-EX-NEXT:    store i32 10, ptr [[J]], align 4
+// CHECK-32-EX-NEXT:    br label [[DOTOMP_FINAL_DONE]]
+// CHECK-32-EX:       .omp.final.done:
+// CHECK-32-EX-NEXT:    ret void
+//
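
For orientation: the kernel checked above is entered in SPMD mode (i8 2 in
the __kmpc_target_init call), the 100-iteration collapsed space is
decomposed as i = iv / 10 and j = iv % 10, and the distribute loop is set
up with schedule kind 91 (static, chunked) using the team's hardware
thread count as the chunk size, the coalesced scheduling clang emits for
GPU offload by default. A source-level sketch of the construct that
produces this IR, reconstructed from the checks rather than quoted from
the test file (names and clause spellings are assumptions), is:

// Hypothetical reconstruction of the _l49 construct, inferred from the IR.
#define M 10

template <typename tx>
tx ftemplate(int n) {
  tx c[M][M];
  tx f = n;
// firstprivate(f) matches f arriving by value in the kernel signature;
// map(tofrom: c) matches the dereferenceable(400) pointer argument.
#pragma omp target teams distribute simd collapse(2) firstprivate(f) map(tofrom: c)
  for (int i = 0; i < M; i++)
    for (int j = 0; j < M; j++) {
      int k = M;               // "store i32 10, ptr [[K]]" above
      c[i][j] = i + j * f + k; // the [[MUL8]]/[[ADD9]]/[[ADD10]] sequence
    }
  return c[0][0];
}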