[clang] 3a12ff0 - [OpenMP][RTL] Remove dead code

Pushpinder Singh via cfe-commits <cfe-commits at lists.llvm.org>
Tue Oct 6 02:44:05 PDT 2020


Author: Pushpinder Singh
Date: 2020-10-06T05:43:47-04:00
New Revision: 3a12ff0dac5ab4f0e1f446abe66b451c1df8dac1

URL: https://github.com/llvm/llvm-project/commit/3a12ff0dac5ab4f0e1f446abe66b451c1df8dac1
DIFF: https://github.com/llvm/llvm-project/commit/3a12ff0dac5ab4f0e1f446abe66b451c1df8dac1.diff

LOG: [OpenMP][RTL] Remove dead code

RequiresDataSharing was always 0, resulting in dead code in the device runtime library.

Reviewed By: jdoerfert, JonChesterfield

Differential Revision: https://reviews.llvm.org/D88829
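
For context, the net effect on the device runtime entry point, as a sketch
(the EXTERN spelling is an assumption based on deviceRTLs/interface.h; the
signatures themselves come from the diff below):

    // Before: clang hardcoded the third argument to 0 at every call site
    // (see emitSPMDEntryHeader below), so any runtime code keyed on
    // RequiresDataSharing was unreachable.
    EXTERN void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
                                        int16_t RequiresOMPRuntime,
                                        int16_t RequiresDataSharing);

    // After: the dead parameter is dropped from the interface, from the
    // device runtime implementation, and from the calls clang generates.
    EXTERN void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
                                        int16_t RequiresOMPRuntime);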

Added: 
    

Modified: 
    clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
    clang/test/OpenMP/amdgcn_target_codegen.cpp
    clang/test/OpenMP/nvptx_SPMD_codegen.cpp
    clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
    clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
    clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
    clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
    clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
    clang/test/OpenMP/nvptx_target_simd_codegen.cpp
    clang/test/OpenMP/nvptx_target_teams_codegen.cpp
    clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
    clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
    clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
    openmp/libomptarget/deviceRTLs/common/omptarget.h
    openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
    openmp/libomptarget/deviceRTLs/interface.h

Removed: 
    


################################################################################
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 433256313c12..5d1856f4f5e7 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -35,7 +35,7 @@ enum OpenMPRTLFunctionNVPTX {
   /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
   OMPRTL_NVPTX__kmpc_kernel_deinit,
   /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
-  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
+  /// int16_t RequiresOMPRuntime);
   OMPRTL_NVPTX__kmpc_spmd_kernel_init,
   /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
   OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
@@ -1345,8 +1345,7 @@ void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
 
   llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                          /*RequiresOMPRuntime=*/
-                         Bld.getInt16(RequiresFullRuntime ? 1 : 0),
-                         /*RequiresDataSharing=*/Bld.getInt16(0)};
+                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
   CGF.EmitRuntimeCall(
       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
 
@@ -1561,7 +1560,7 @@ CGOpenMPRuntimeGPU::createNVPTXRuntimeFunction(unsigned Function) {
   case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
     // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
     // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
-    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
     auto *FnTy =
         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");

diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
index 0b6f2d40ffe8..85ef69942a50 100644
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -32,7 +32,7 @@ int test_amdgcn_target_tid_threads_simd() {
 
 // CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
 // CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
-// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0)
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0)
 #pragma omp target simd
   for (int i = 0; i < N; i++) {
     arr[i] = 1;

diff --git a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
index 5fa820fcba70..6c54818e23d5 100644
--- a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
@@ -21,28 +21,28 @@ int a;
 // CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
 
 void foo() {
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target teams distribute parallel for simd if(a)
@@ -67,28 +67,28 @@ void foo() {
   for (int i = 0; i < 10; ++i)
     ;
 int a;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target teams distribute parallel for lastprivate(a)
@@ -112,28 +112,28 @@ int a;
 #pragma omp target teams distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target teams
@@ -175,28 +175,28 @@ int a;
 #pragma omp distribute parallel for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target teams
@@ -227,28 +227,28 @@ int a;
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[DISTR_LIGHT]]
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[DISTR_FULL]]
 // CHECK-DAG: [[FULL]]
 #pragma omp target
@@ -286,22 +286,22 @@ int a;
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 #pragma omp target parallel for if(a)
   for (int i = 0; i < 10; ++i)
@@ -324,28 +324,28 @@ int a;
 #pragma omp target parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
 #pragma omp target parallel if(a)
@@ -376,27 +376,27 @@ int a;
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
 // CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 // CHECK-DAG: [[BAR_FULL]]
 #pragma omp target
@@ -434,22 +434,22 @@ int a;
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
 // CHECK-DAG: [[FOR_LIGHT]]
 // CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK-DAG: [[FULL]]
 #pragma omp target
 #pragma omp parallel for

diff --git a/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
index 395a2d48ff0f..37e71d0dcec6 100644
--- a/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
@@ -11,13 +11,13 @@
 // CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
 
 void foo() {
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target teams distribute parallel for simd
   for (int i = 0; i < 10; ++i)
     ;
@@ -40,13 +40,13 @@ void foo() {
   for (int i = 0; i < 10; ++i)
     ;
 int a;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target teams distribute parallel for lastprivate(a)
   for (int i = 0; i < 10; ++i)
     a = i;
@@ -68,13 +68,13 @@ int a;
 #pragma omp target teams distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target teams
 #pragma omp distribute parallel for simd
   for (int i = 0; i < 10; ++i)
@@ -103,13 +103,13 @@ int a;
 #pragma omp distribute parallel for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target teams
 #pragma omp distribute parallel for
   for (int i = 0; i < 10; ++i)
@@ -138,13 +138,13 @@ int a;
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target
 #pragma omp teams
 #pragma omp distribute parallel for
@@ -180,13 +180,13 @@ int a;
 #pragma omp distribute parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target parallel for
   for (int i = 0; i < 10; ++i)
     ;
@@ -208,13 +208,13 @@ int a;
 #pragma omp target parallel for schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target parallel
 #pragma omp for simd
   for (int i = 0; i < 10; ++i)
@@ -243,13 +243,13 @@ int a;
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target
 #pragma omp parallel
 #pragma omp for simd ordered
@@ -285,13 +285,13 @@ int a;
 #pragma omp for simd schedule(guided)
   for (int i = 0; i < 10; ++i)
     ;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 #pragma omp target
 #pragma omp parallel for
   for (int i = 0; i < 10; ++i)

diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
index de8d4e0d234c..db3d0474c2c0 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
@@ -61,7 +61,7 @@ int bar(int n){
 // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
 // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
 // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //
@@ -99,7 +99,7 @@ int bar(int n){
 // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
 // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
 // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //

diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
index 93de4b6397ba..123ade5094a2 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
@@ -53,7 +53,7 @@ int bar(int n){
 // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
 // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
 // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
 // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
@@ -84,7 +84,7 @@ int bar(int n){
 // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
 // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
 // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
 // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],

diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
index 778b8d3300df..7b3de7d462d2 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
@@ -54,7 +54,7 @@ int bar(int n){
 }
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //
@@ -72,7 +72,7 @@ int bar(int n){
 // CHECK: }
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //
@@ -90,7 +90,7 @@ int bar(int n){
 // CHECK: }
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK: br label {{%?}}[[EXEC:.+]]
 //

diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 35240086d3da..459330d31f66 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -52,774 +52,756 @@ int bar(int n){
   return a;
 }
 
-  // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
-  //
-  // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
-  // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
-  // CHECK: br label {{%?}}[[EXECUTE:.+]]
-  //
-  // CHECK: [[EXECUTE]]
-  // CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
-  // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-  //
-  //
-  // define internal void [[PFN]](
-  // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
-  // CHECK: [[EV:%.+]] = load double, double* [[E]], align
-  // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
-  // CHECK: store double [[ADD]], double* [[E]], align
-  // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
-  // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
-  // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
-  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-  // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
-  // CHECK: br i1 [[CMP]], label
-
-  // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
-  // CHECK: [[EV:%.+]] = load double, double* [[E]], align
-  // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
-  // CHECK: store double [[ADD]], double* [[E_IN]], align
-  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-  // CHECK: br label
-  //
-  // CHECK: ret
-
-  //
-  // Reduction function
-  // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
-  // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
-  // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
-  //
-  // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
-  // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
-  //
-  // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
-  // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
-  // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
-  // CHECK: store double [[RES]], double* [[VAR_LHS]],
-  // CHECK: ret void
-
-  //
-  // Shuffle and reduce function
-  // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
-  // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
-  // CHECK: [[REMOTE_ELT:%.+]] = alloca double
-  //
-  // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
-  // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
-  // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
-  //
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
-  //
-  // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
-  // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
-  // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
-  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-  // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
-  //
-  // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
-  // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
-  // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
-  //
-  // Condition to reduce
-  // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
-  //
-  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-  // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
-  // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
-  //
-  // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
-  // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
-  // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
-  // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
-  // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
-  // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
-  //
-  // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
-  // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
-  // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
-  //
-  // CHECK: [[DO_REDUCE]]
-  // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
-  // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
-  // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
-  // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
-  //
-  // CHECK: [[REDUCE_ELSE]]
-  // CHECK: br label {{%?}}[[REDUCE_CONT]]
-  //
-  // CHECK: [[REDUCE_CONT]]
-  // Now check if we should just copy over the remote reduction list
-  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-  // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
-  // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
-  // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-  //
-  // CHECK: [[DO_COPY]]
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
-  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
-  // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
-  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
-  //
-  // CHECK: [[COPY_ELSE]]
-  // CHECK: br label {{%?}}[[COPY_CONT]]
-  //
-  // CHECK: [[COPY_CONT]]
-  // CHECK: void
-
-  //
-  // Inter warp copy function
-  // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
-  // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
-  // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
-  // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
-  // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
-  // CHECK: br label
-  // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
-  // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
-  // CHECK: br i1 [[DONE_COPY]], label
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-  //
-  // [[DO_COPY]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
-  //
-  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
-  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
-  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
-  //
-  // CHECK: [[COPY_ELSE]]
-  // CHECK: br label {{%?}}[[COPY_CONT]]
-  //
-  // Barrier after copy to shared memory storage medium.
-  // CHECK: [[COPY_CONT]]
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-  //
-  // Read into warp 0.
-  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-  //
-  // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
-  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
-  // CHECK: br label {{%?}}[[READ_CONT:.+]]
-  //
-  // CHECK: [[READ_ELSE]]
-  // CHECK: br label {{%?}}[[READ_CONT]]
-  //
-  // CHECK: [[READ_CONT]]
-  // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
-  // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
-  // CHECK: br label
-  // CHECK: ret
-
-
-
-
-
-
-
-
-
-
-  // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
-  //
-  // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
-  // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
-  // CHECK: br label {{%?}}[[EXECUTE:.+]]
-  //
-  // CHECK: [[EXECUTE]]
-  // CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
-  // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-  //
-  //
-  // define internal void [[PFN1]](
-  // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
-  // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
-  // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
-  // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
-  // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
-  // CHECK: store i8 [[TRUNC]], i8* [[C]], align
-  // CHECK: [[DV:%.+]] = load float, float* [[D]], align
-  // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
-  // CHECK: store float [[MUL]], float* [[D]], align
-  // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: store i8* [[C]], i8** [[PTR1]], align
-  // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
-  // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
-  // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
-  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-  // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
-  // CHECK: br i1 [[CMP]], label
-  // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
-  // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
-  // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
-  // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
-  // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
-  // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
-  // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
-  // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
-  // CHECK: [[DV:%.+]] = load float, float* [[D]], align
-  // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
-  // CHECK: store float [[MUL]], float* [[D_IN]], align
-  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-  // CHECK: br label
-  //
-  // CHECK: ret
-
-  //
-  // Reduction function
-  // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
-  // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
-  //
-  // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
-  //
-  // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
-  // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
-  //
-  // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
-  // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
-  //
-  // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
-  // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
-  // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
-  // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
-  // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
-  // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
-  // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
-  //
-  // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
-  // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
-  // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
-  // CHECK: store float [[RES]], float* [[VAR2_LHS]],
-  // CHECK: ret void
-
-  //
-  // Shuffle and reduce function
-  // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
-  // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
-  // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
-  // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
-  //
-  // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
-  // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
-  // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
-  //
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
-  //
-  // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
-  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-  // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
-  // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
-  //
-  // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
-  // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
-  //
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
-  //
-  // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
-  // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
-  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
-  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-  // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
-  //
-  // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
-  // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
-  // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
-  //
-  // Condition to reduce
-  // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
-  //
-  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-  // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
-  // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
-  //
-  // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
-  // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
-  // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
-  // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
-  // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
-  // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
-  //
-  // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
-  // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
-  // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
-  //
-  // CHECK: [[DO_REDUCE]]
-  // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
-  // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
-  // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
-  // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
-  //
-  // CHECK: [[REDUCE_ELSE]]
-  // CHECK: br label {{%?}}[[REDUCE_CONT]]
-  //
-  // CHECK: [[REDUCE_CONT]]
-  // Now check if we should just copy over the remote reduction list
-  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-  // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
-  // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
-  // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-  //
-  // CHECK: [[DO_COPY]]
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
-  // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
-  //
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
-  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
-  // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
-  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
-  //
-  // CHECK: [[COPY_ELSE]]
-  // CHECK: br label {{%?}}[[COPY_CONT]]
-  //
-  // CHECK: [[COPY_CONT]]
-  // CHECK: void
-
-  //
-  // Inter warp copy function
-  // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
-  // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
-  // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
-  // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-  //
-  // [[DO_COPY]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
-  // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
-  //
-  // CHECK: [[COPY_ELSE]]
-  // CHECK: br label {{%?}}[[COPY_CONT]]
-  //
-  // Barrier after copy to shared memory storage medium.
-  // CHECK: [[COPY_CONT]]
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-  //
-  // Read into warp 0.
-  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-  //
-  // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
-  // CHECK: br label {{%?}}[[READ_CONT:.+]]
-  //
-  // CHECK: [[READ_ELSE]]
-  // CHECK: br label {{%?}}[[READ_CONT]]
-  //
-  // CHECK: [[READ_CONT]]
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-  //
-  // [[DO_COPY]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  //
-  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
-  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
-  //
-  // CHECK: [[COPY_ELSE]]
-  // CHECK: br label {{%?}}[[COPY_CONT]]
-  //
-  // Barrier after copy to shared memory storage medium.
-  // CHECK: [[COPY_CONT]]
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-  //
-  // Read into warp 0.
-  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-  //
-  // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
-  // CHECK: br label {{%?}}[[READ_CONT:.+]]
-  //
-  // CHECK: [[READ_ELSE]]
-  // CHECK: br label {{%?}}[[READ_CONT]]
-  //
-  // CHECK: [[READ_CONT]]
-  // CHECK: ret
-
-
-
-
-
-
-
-
-
-
-  // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
-  //
-  // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
-  // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
-  // CHECK: br label {{%?}}[[EXECUTE:.+]]
-  //
-  // CHECK: [[EXECUTE]]
-  // CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
-  // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
-  //
-  //
-  // define internal void [[PFN2]](
-  // CHECK: store i32 0, i32* [[A:%.+]], align
-  // CHECK: store i16 -32768, i16* [[B:%.+]], align
-  // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
-  // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
-  // CHECK: store i32 [[OR]], i32* [[A]], align
-  // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
-  // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
-  // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
-  // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
-  //
-  // CHECK: [[DO_MAX]]
-  // CHECK: br label {{%?}}[[MAX_CONT:.+]]
-  //
-  // CHECK: [[MAX_ELSE]]
-  // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
-  // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
-  // CHECK: br label {{%?}}[[MAX_CONT]]
-  //
-  // CHECK: [[MAX_CONT]]
-  // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
-  // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
-  // CHECK: store i16 [[TRUNC]], i16* [[B]], align
-  // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
-  // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
-  // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
-  // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
-  // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
-  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-  // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
-  // CHECK: br i1 [[CMP]], label
-
-  // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
-  // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
-  // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
-  // CHECK: store i32 [[OR]], i32* [[A_IN]], align
-  // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
-  // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
-  // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
-  // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
-  // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
-  // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
-  //
-  // CHECK: [[DO_MAX]]
-  // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
-  // CHECK: br label {{%?}}[[MAX_CONT:.+]]
-  //
-  // CHECK: [[MAX_ELSE]]
-  // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
-  // CHECK: br label {{%?}}[[MAX_CONT]]
-  //
-  // CHECK: [[MAX_CONT]]
-  // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
-  // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
-  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-  // CHECK: br label
-  //
-  // CHECK: ret
-
-  //
-  // Reduction function
-  // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
-  // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
-  // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
-  //
-  // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
-  // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
-  //
-  // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
-  // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
-  //
-  // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
-  // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
-  //
-  // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
-  // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
-  // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
-  // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
-  //
-  // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
-  // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
-  // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
-  // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
-  //
-  // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
-  // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
-  //
-  // CHECK: [[DO_MAX]]
-  // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
-  // CHECK: br label {{%?}}[[MAX_CONT:.+]]
-  //
-  // CHECK: [[MAX_ELSE]]
-  // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
-  // CHECK: br label {{%?}}[[MAX_CONT]]
-  //
-  // CHECK: [[MAX_CONT]]
-  // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
-  // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
-  // CHECK: ret void
-
-  //
-  // Shuffle and reduce function
-  // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
-  // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
-  // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
-  // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
-  //
-  // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
-  // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
-  // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
-  //
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
-  //
-  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-  // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
-  //
-  // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
-  // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
-  // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
-  //
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
-  // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
-  //
-  // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
-  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
-  // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
-  // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
-  //
-  // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
-  // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
-  // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
-  //
-  // Condition to reduce
-  // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
-  //
-  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-  // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
-  // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
-  //
-  // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
-  // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
-  // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
-  // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
-  // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
-  // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
-  //
-  // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
-  // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
-  // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
-  //
-  // CHECK: [[DO_REDUCE]]
-  // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
-  // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
-  // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
-  // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
-  //
-  // CHECK: [[REDUCE_ELSE]]
-  // CHECK: br label {{%?}}[[REDUCE_CONT]]
-  //
-  // CHECK: [[REDUCE_CONT]]
-  // Now check if we should just copy over the remote reduction list
-  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
-  // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
-  // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
-  // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-  //
-  // CHECK: [[DO_COPY]]
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
-  // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
-  //
-  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
-  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
-  // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
-  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
-  //
-  // CHECK: [[COPY_ELSE]]
-  // CHECK: br label {{%?}}[[COPY_CONT]]
-  //
-  // CHECK: [[COPY_CONT]]
-  // CHECK: void
-
-  //
-  // Inter warp copy function
-  // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
-  // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
-  // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
-  // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-  //
-  // [[DO_COPY]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  //
-  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
-  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
-  //
-  // CHECK: [[COPY_ELSE]]
-  // CHECK: br label {{%?}}[[COPY_CONT]]
-  //
-  // Barrier after copy to shared memory storage medium.
-  // CHECK: [[COPY_CONT]]
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-  //
-  // Read into warp 0.
-  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-  //
-  // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
-  // CHECK: br label {{%?}}[[READ_CONT:.+]]
-  //
-  // CHECK: [[READ_ELSE]]
-  // CHECK: br label {{%?}}[[READ_CONT]]
-  //
-  // CHECK: [[READ_CONT]]
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
-  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
-  //
-  // [[DO_COPY]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
-  //
-  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
-  // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
-  //
-  // CHECK: [[COPY_ELSE]]
-  // CHECK: br label {{%?}}[[COPY_CONT]]
-  //
-  // Barrier after copy to shared memory storage medium.
-  // CHECK: [[COPY_CONT]]
-  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
-  // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
-  //
-  // Read into warp 0.
-  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
-  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
-  //
-  // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
-  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
-  // CHECK: br label {{%?}}[[READ_CONT:.+]]
-  //
-  // CHECK: [[READ_ELSE]]
-  // CHECK: br label {{%?}}[[READ_CONT]]
-  //
-  // CHECK: [[READ_CONT]]
-  // CHECK: ret
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN]](
+// CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
+// CHECK: [[EV:%.+]] = load double, double* [[E]], align
+// CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
+// CHECK: store double [[ADD]], double* [[E]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
+// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+
+// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
+// CHECK: [[EV:%.+]] = load double, double* [[E]], align
+// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
+// CHECK: store double [[ADD]], double* [[E_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
+//
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
+//
+// CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
+// CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
+// CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
+// CHECK: store double [[RES]], double* [[VAR_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT:%.+]] = alloca double
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+//
+// CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
+// CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
+// CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
+// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
+// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+// CHECK: br label
+// CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+// CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+// CHECK: br i1 [[DONE_COPY]], label
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+// CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN1]](
+// CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
+// CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
+// CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[TRUNC]], i8* [[C]], align
+// CHECK: [[DV:%.+]] = load float, float* [[D]], align
+// CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
+// CHECK: store float [[MUL]], float* [[D]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: store i8* [[C]], i8** [[PTR1]], align
+// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
+// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
+// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
+// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
+// CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
+// CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
+// CHECK: [[DV:%.+]] = load float, float* [[D]], align
+// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
+// CHECK: store float [[MUL]], float* [[D_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+//
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+//
+// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
+//
+// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
+//
+// CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
+// CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
+// CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
+// CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+// CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
+//
+// CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
+// CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
+// CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+// CHECK: store float [[RES]], float* [[VAR2_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
+// CHECK: [[REMOTE_ELT2:%.+]] = alloca float
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
+//
+// CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+// CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
+//
+// CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
+// CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+//
+// CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
+// CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
+// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
+// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
+// CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
+//
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
+// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+//
+// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
+// CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: ret
+
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN2]](
+// CHECK: store i32 0, i32* [[A:%.+]], align
+// CHECK: store i16 -32768, i16* [[B:%.+]], align
+// CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
+// CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
+// CHECK: store i32 [[OR]], i32* [[A]], align
+// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+// CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
+// CHECK: store i16 [[TRUNC]], i16* [[B]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
+// CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
+// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
+// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+
+// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
+// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+// CHECK: store i32 [[OR]], i32* [[A_IN]], align
+// CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+// CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+// CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
+//
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
+//
+// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
+//
+// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
+//
+// CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
+// CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
+// CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+// CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
+//
+// CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
+// CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
+// CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
+// CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
+//
+// CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
+// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+//
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
+// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
+// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+//
+// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
+//
+// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
+// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
+// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
+//
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+//
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: ret
 
 #endif
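
(Editor's note on the hunk above.) The block of checks just replaced and re-added encodes, in FileCheck form, the whole device-side reduction pipeline: lanes within a warp exchange partial values via __kmpc_shuffle_int32, warp masters stage results through the [32 x i32] shared transfer array, and warp 0 reads them back; a 64-bit element (the double case in the l27 kernel) is moved in two i32 rounds, hence the CNT loop bounded by "icmp ult i32 [[CNT]], 2". Below is a minimal host-side sketch of that data flow, assuming 32-lane warps; the names Medium, barrier_stub, and shuffle_stub are stand-ins of mine, not runtime identifiers, and the stubs only exist so the sketch compiles.

    #include <cstdint>

    static volatile int32_t Medium[32];     // the [32 x i32] transfer storage

    static void barrier_stub() {}           // stands in for __kmpc_barrier(...)
    static int32_t shuffle_stub(int32_t V, int16_t Delta, int16_t Width) {
      (void)Delta; (void)Width;
      return V;                             // on device: __kmpc_shuffle_int32
    }

    // Tree reduction within one warp (the SHUFFLE_REDUCE_FN checks). This is
    // the ALGVER==1 flavor, where only lanes below the offset keep reducing.
    static int32_t warp_reduce(int32_t Val, int16_t LaneId) {
      for (int16_t Offset = 16; Offset > 0; Offset /= 2) {
        int32_t Remote = shuffle_stub(Val, Offset, /*WarpSize=*/32);
        if (LaneId < Offset)                // matches: icmp ult i16 [[LANEID]], [[LANEOFFSET]]
          Val += Remote;                    // stand-in for the reduction op
      }
      return Val;
    }

    // Inter-warp copy (the WARP_COPY_FN checks). NumChunks is 2 for a 64-bit
    // element, matching the "icmp ult i32 [[CNT]], 2" loop bound above.
    static void inter_warp_copy(int32_t *Elem, int Tid, int ActiveWarps,
                                int NumChunks) {
      const int LaneId = Tid & 31;          // matches: and i32 {{.+}}, 31
      const int WarpId = Tid >> 5;          // matches: ashr i32 {{.+}}, 5
      for (int Cnt = 0; Cnt < NumChunks; ++Cnt) {
        barrier_stub();
        if (LaneId == 0)                    // warp master writes its chunk
          Medium[WarpId] = Elem[Cnt];
        barrier_stub();                     // barrier after copy to the medium
        if (Tid < ActiveWarps)              // a warp-0 lane reads it back
          Elem[Cnt] = Medium[Tid];
      }
    }

On the device the stubs correspond to __kmpc_barrier and __kmpc_shuffle_int32, and the ALGVER switch (the CONDALG0/1/2 checks) selects between the full-warp, partial-warp, and power-of-two reduction variants.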

diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
index 168b82057cac..b35609cc5e92 100644
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -61,28 +61,28 @@ int bar(int n){
 }
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l32}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK-NOT: call void @__kmpc_for_static_init
 // CHECK-NOT: call void @__kmpc_for_static_fini
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l37}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK-NOT: call void @__kmpc_for_static_init
 // CHECK-NOT: call void @__kmpc_for_static_fini
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l42}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK-NOT: call void @__kmpc_for_static_init
 // CHECK-NOT: call void @__kmpc_for_static_fini
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l47}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK-NOT: call void @__kmpc_for_static_init
 // CHECK-NOT: call void @__kmpc_for_static_fini
 // CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(

diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
index 8ff393f074e4..3a6e39dfdba1 100644
--- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -232,7 +232,7 @@ int bar(int n){
   // CHECK: ret void
 
 // CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
 // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
 // CHECK-NOT: call void @__kmpc_serialized_parallel(

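In these teams tests the remaining i16 argument is RequiresOMPRuntime: the l37 kernel above passes i16 1 because it needs the full runtime (note the __kmpc_data_sharing_init_stack_spmd call that follows), while the simd kernels earlier pass i16 0. A condensed sketch of how the device runtime consumes the flag, based on the __kmpc_spmd_kernel_init body further down in this diff (the else-branch enum value is inferred from the surrounding code and may differ in detail):

  setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
                                                  : RuntimeUninitialized);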
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
index 4fd0f71c5e5d..d40aad3dee77 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -100,7 +100,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l50(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void [[PARALLEL:@.+]](
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
@@ -128,7 +128,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
 
@@ -143,7 +143,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
 
@@ -159,7 +159,7 @@ int bar(int n){
 // Distribute with collapse(2)
 // CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 // CHECK: ret void
 

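The last hunk above covers the collapse(2) case, where the two distribute loops are folded into a single iteration space and the scalar reaches the kernel as the F_IN argument. An illustrative source shape for such a kernel; the array, bounds, and loop body are assumptions, not the test's actual code:

  #pragma omp target teams distribute parallel for collapse(2)
  for (int i = 0; i < 10; ++i)
    for (int j = 0; j < 10; ++j)
      a[i][j] += f;   // f is passed through as the F_IN kernel argument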
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
index a933c7e021b8..e9126fce7020 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -83,7 +83,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
 // SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
@@ -109,7 +109,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -124,7 +124,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -140,7 +140,7 @@ int bar(int n){
 // CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
 // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
 // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align

diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
index 8d12c857cb43..15f5f09f3899 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
@@ -70,7 +70,7 @@ int bar(int n){
 }
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l37(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -78,7 +78,7 @@ int bar(int n){
 // CHECK: ret void
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -86,7 +86,7 @@ int bar(int n){
 // CHECK: ret void
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l48(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -95,7 +95,7 @@ int bar(int n){
 
 // CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l53({{.+}}, i{{32|64}} [[F_IN:%.+]])
 // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
 
 // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align

diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
index 6d5d6cd19bd6..0ccd71c3b55f 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -92,15 +92,7 @@ struct __kmpc_data_sharing_worker_slot_static {
   void *DataEnd;
   char Data[DS_Worker_Warp_Slot_Size];
 };
-// Additional master slot type which is initialized with the default master slot
-// size of 4 bytes.
-struct __kmpc_data_sharing_master_slot_static {
-  __kmpc_data_sharing_slot *Next;
-  __kmpc_data_sharing_slot *Prev;
-  void *PrevSlotStackPtr;
-  void *DataEnd;
-  char Data[DS_Slot_Size];
-};
+
 extern DEVICE SHARED DataSharingStateTy DataSharingState;
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -204,37 +196,6 @@ class omptarget_nvptx_TeamDescr {
   // init
   INLINE void InitTeamDescr();
 
-  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
-    // If this is invoked by the master thread of the master warp then
-    // initialize it with a smaller slot.
-    if (IsMasterThread) {
-      // Do not initialize this slot again if it has already been initalized.
-      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
-        return 0;
-      // Initialize the pointer to the end of the slot given the size of the
-      // data section. DataEnd is non-inclusive.
-      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
-      // We currently do not have a next slot.
-      master_rootS[0].Next = 0;
-      master_rootS[0].Prev = 0;
-      master_rootS[0].PrevSlotStackPtr = 0;
-      return (__kmpc_data_sharing_slot *)&master_rootS[0];
-    }
-    // Do not initialize this slot again if it has already been initalized.
-    if (worker_rootS[wid].DataEnd ==
-        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
-      return 0;
-    // Initialize the pointer to the end of the slot given the size of the data
-    // section. DataEnd is non-inclusive.
-    worker_rootS[wid].DataEnd =
-        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
-    // We currently do not have a next slot.
-    worker_rootS[wid].Next = 0;
-    worker_rootS[wid].Prev = 0;
-    worker_rootS[wid].PrevSlotStackPtr = 0;
-    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
-  }
-
   INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
     worker_rootS[wid].DataEnd =
         &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
@@ -253,7 +214,6 @@ class omptarget_nvptx_TeamDescr {
 
   ALIGN(16)
   __kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
-  ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
 };
 
 ////////////////////////////////////////////////////////////////////////////////

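With the master slot type and RootS gone, only the per-warp worker slots remain, initialized through GetPreallocatedSlotAddr with the same non-inclusive DataEnd convention. A hypothetical use of the surviving initializer, mirroring the book-keeping done by the block removed from omptarget.cu below (illustrative only; the actual call sites live elsewhere in the runtime):

  __kmpc_data_sharing_slot *S = currTeamDescr.GetPreallocatedSlotAddr(WID);
  DataSharingState.SlotPtr[WID]  = S;
  DataSharingState.StackPtr[WID] = (void *)&S->Data[0];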
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index 6c1d5319595c..5ccc84539400 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -77,8 +77,7 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
   omptarget_nvptx_workFn = 0;
 }
 
-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
-                                    int16_t RequiresDataSharing) {
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
   PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
 
   setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
@@ -134,15 +133,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
         "thread will execute parallel region with id %d in a team of "
         "%d threads\n",
         (int)newTaskDescr->ThreadId(), (int)ThreadLimit);
-
-  if (RequiresDataSharing && GetLaneId() == 0) {
-    // Warp master initializes data sharing environment.
-    unsigned WID = threadId / WARPSIZE;
-    __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(
-        WID, WID == WARPSIZE - 1);
-    DataSharingState.SlotPtr[WID] = RootS;
-    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
-  }
 }
 
 EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {

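Across the test updates above, every SPMD entry call passed a constant i16 0 for the trailing RequiresDataSharing argument, so the block deleted here was unreachable and the parameter could be dropped. The entry/exit pair the tests now check reduces to the following shape (a C-style sketch of the sequence clang generates; thread_limit is a placeholder, not a real variable name):

  __kmpc_spmd_kernel_init(thread_limit, /*RequiresOMPRuntime=*/0);
  /* ... SPMD region body ... */
  __kmpc_spmd_kernel_deinit_v2(/*RequiresOMPRuntime=*/0);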
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index 4d352bc648fa..330880556293 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -421,8 +421,8 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
 // non standard
 EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
 EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
-                                    int16_t RequiresDataSharing);
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
+                                    int16_t RequiresOMPRuntime);
 EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
 EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn);
 EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
