[Openmp-commits] [openmp] 3a12ff0 - [OpenMP][RTL] Remove dead code
Pushpinder Singh via Openmp-commits
openmp-commits at lists.llvm.org
Tue Oct 6 02:44:08 PDT 2020
Author: Pushpinder Singh
Date: 2020-10-06T05:43:47-04:00
New Revision: 3a12ff0dac5ab4f0e1f446abe66b451c1df8dac1
URL: https://github.com/llvm/llvm-project/commit/3a12ff0dac5ab4f0e1f446abe66b451c1df8dac1
DIFF: https://github.com/llvm/llvm-project/commit/3a12ff0dac5ab4f0e1f446abe66b451c1df8dac1.diff
LOG: [OpenMP][RTL] Remove dead code
RequiresDataSharing was always 0, resulting dead code in device runtime library.
Reviewed By: jdoerfert, JonChesterfield
Differential Revision: https://reviews.llvm.org/D88829
Added:
Modified:
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/test/OpenMP/amdgcn_target_codegen.cpp
clang/test/OpenMP/nvptx_SPMD_codegen.cpp
clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
clang/test/OpenMP/nvptx_target_simd_codegen.cpp
clang/test/OpenMP/nvptx_target_teams_codegen.cpp
clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
openmp/libomptarget/deviceRTLs/common/omptarget.h
openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
openmp/libomptarget/deviceRTLs/interface.h
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 433256313c12..5d1856f4f5e7 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -35,7 +35,7 @@ enum OpenMPRTLFunctionNVPTX {
/// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
OMPRTL_NVPTX__kmpc_kernel_deinit,
/// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
- /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
+ /// int16_t RequiresOMPRuntime);
OMPRTL_NVPTX__kmpc_spmd_kernel_init,
/// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
@@ -1345,8 +1345,7 @@ void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
/*RequiresOMPRuntime=*/
- Bld.getInt16(RequiresFullRuntime ? 1 : 0),
- /*RequiresDataSharing=*/Bld.getInt16(0)};
+ Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
@@ -1561,7 +1560,7 @@ CGOpenMPRuntimeGPU::createNVPTXRuntimeFunction(unsigned Function) {
case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
// Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
- llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+ llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
auto *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
index 0b6f2d40ffe8..85ef69942a50 100644
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -32,7 +32,7 @@ int test_amdgcn_target_tid_threads_simd() {
// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
-// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0)
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0)
#pragma omp target simd
for (int i = 0; i < N; i++) {
arr[i] = 1;
diff --git a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
index 5fa820fcba70..6c54818e23d5 100644
--- a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp
@@ -21,28 +21,28 @@ int a;
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
void foo() {
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target teams distribute parallel for simd if(a)
@@ -67,28 +67,28 @@ void foo() {
for (int i = 0; i < 10; ++i)
;
int a;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target teams distribute parallel for lastprivate(a)
@@ -112,28 +112,28 @@ int a;
#pragma omp target teams distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target teams
@@ -175,28 +175,28 @@ int a;
#pragma omp distribute parallel for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target teams
@@ -227,28 +227,28 @@ int a;
#pragma omp distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[DISTR_LIGHT]]
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[DISTR_FULL]]
// CHECK-DAG: [[FULL]]
#pragma omp target
@@ -286,22 +286,22 @@ int a;
#pragma omp distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
#pragma omp target parallel for if(a)
for (int i = 0; i < 10; ++i)
@@ -324,28 +324,28 @@ int a;
#pragma omp target parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
#pragma omp target parallel if(a)
@@ -376,27 +376,27 @@ int a;
#pragma omp for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
// CHECK-DAG: [[BAR_LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
// CHECK-DAG: [[BAR_FULL]]
#pragma omp target
@@ -434,22 +434,22 @@ int a;
#pragma omp for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0)
// CHECK-DAG: [[FOR_LIGHT]]
// CHECK-DAG: [[LIGHT]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK-DAG: [[FULL]]
#pragma omp target
#pragma omp parallel for
diff --git a/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
index 395a2d48ff0f..37e71d0dcec6 100644
--- a/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp
@@ -11,13 +11,13 @@
// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1
void foo() {
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target teams distribute parallel for simd
for (int i = 0; i < 10; ++i)
;
@@ -40,13 +40,13 @@ void foo() {
for (int i = 0; i < 10; ++i)
;
int a;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target teams distribute parallel for lastprivate(a)
for (int i = 0; i < 10; ++i)
a = i;
@@ -68,13 +68,13 @@ int a;
#pragma omp target teams distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target teams
#pragma omp distribute parallel for simd
for (int i = 0; i < 10; ++i)
@@ -103,13 +103,13 @@ int a;
#pragma omp distribute parallel for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target teams
#pragma omp distribute parallel for
for (int i = 0; i < 10; ++i)
@@ -138,13 +138,13 @@ int a;
#pragma omp distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target
#pragma omp teams
#pragma omp distribute parallel for
@@ -180,13 +180,13 @@ int a;
#pragma omp distribute parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target parallel for
for (int i = 0; i < 10; ++i)
;
@@ -208,13 +208,13 @@ int a;
#pragma omp target parallel for schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target parallel
#pragma omp for simd
for (int i = 0; i < 10; ++i)
@@ -243,13 +243,13 @@ int a;
#pragma omp for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target
#pragma omp parallel
#pragma omp for simd ordered
@@ -285,13 +285,13 @@ int a;
#pragma omp for simd schedule(guided)
for (int i = 0; i < 10; ++i)
;
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}})
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
#pragma omp target
#pragma omp parallel for
for (int i = 0; i < 10; ++i)
diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
index de8d4e0d234c..db3d0474c2c0 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
@@ -61,7 +61,7 @@ int bar(int n){
// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
@@ -99,7 +99,7 @@ int bar(int n){
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
index 93de4b6397ba..123ade5094a2 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
@@ -53,7 +53,7 @@ int bar(int n){
// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
@@ -84,7 +84,7 @@ int bar(int n){
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
index 778b8d3300df..7b3de7d462d2 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp
@@ -54,7 +54,7 @@ int bar(int n){
}
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
@@ -72,7 +72,7 @@ int bar(int n){
// CHECK: }
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
@@ -90,7 +90,7 @@ int bar(int n){
// CHECK: }
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 35240086d3da..459330d31f66 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -52,774 +52,756 @@ int bar(int n){
return a;
}
- // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
- //
- // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
- // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
- // CHECK: br label {{%?}}[[EXECUTE:.+]]
- //
- // CHECK: [[EXECUTE]]
- // CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
- // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
- //
- //
- // define internal void [[PFN]](
- // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
- // CHECK: [[EV:%.+]] = load double, double* [[E]], align
- // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
- // CHECK: store double [[ADD]], double* [[E]], align
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
- // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[CMP]], label
-
- // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
- // CHECK: [[EV:%.+]] = load double, double* [[E]], align
- // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
- // CHECK: store double [[ADD]], double* [[E_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label
- //
- // CHECK: ret
-
- //
- // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
- // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
- //
- // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
- // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
- //
- // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
- // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
- // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
- // CHECK: store double [[RES]], double* [[VAR_LHS]],
- // CHECK: ret void
-
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT:%.+]] = alloca double
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- //
- // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
- // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
- // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
- // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
- // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
- //
- // Inter warp copy function
- // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
- // CHECK: br label
- // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
- // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
- // CHECK: br i1 [[DONE_COPY]], label
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
- // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
- // CHECK: br label
- // CHECK: ret
-
-
-
-
-
-
-
-
-
-
- // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
- //
- // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
- // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
- // CHECK: br label {{%?}}[[EXECUTE:.+]]
- //
- // CHECK: [[EXECUTE]]
- // CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
- // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
- //
- //
- // define internal void [[PFN1]](
- // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
- // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
- // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[TRUNC]], i8* [[C]], align
- // CHECK: [[DV:%.+]] = load float, float* [[D]], align
- // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
- // CHECK: store float [[MUL]], float* [[D]], align
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: store i8* [[C]], i8** [[PTR1]], align
- // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
- // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[CMP]], label
- // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
- // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
- // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
- // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
- // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
- // CHECK: [[DV:%.+]] = load float, float* [[D]], align
- // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
- // CHECK: store float [[MUL]], float* [[D_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label
- //
- // CHECK: ret
-
- //
- // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
- //
- // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
- //
- // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
- // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
- //
- // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
- // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
- //
- // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
- // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
- // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
- // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
- // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
- // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
- // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
- //
- // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
- // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
- // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
- // CHECK: store float [[RES]], float* [[VAR2_LHS]],
- // CHECK: ret void
-
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- //
- // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
- //
- // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
- // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- //
- // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
- // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
- // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
- // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
- // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
- //
- // Inter warp copy function
- // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- //
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: ret
-
-
-
-
-
-
-
-
-
-
- // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
- //
- // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
- // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
- // CHECK: br label {{%?}}[[EXECUTE:.+]]
- //
- // CHECK: [[EXECUTE]]
- // CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
- // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
- //
- //
- // define internal void [[PFN2]](
- // CHECK: store i32 0, i32* [[A:%.+]], align
- // CHECK: store i16 -32768, i16* [[B:%.+]], align
- // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
- // CHECK: store i32 [[OR]], i32* [[A]], align
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
- // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
- // CHECK: store i16 [[TRUNC]], i16* [[B]], align
- // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
- // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
- // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
- // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
- // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
- // CHECK: br i1 [[CMP]], label
-
- // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
- // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
- // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
- // CHECK: store i32 [[OR]], i32* [[A_IN]], align
- // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
- // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
- // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
- // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
- // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label
- //
- // CHECK: ret
-
- //
- // Reduction function
- // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
- // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
- // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
- //
- // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
- // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
- //
- // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
- // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
- //
- // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
- // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
- //
- // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
- // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
- // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
- // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
- //
- // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
- // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
- // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
- // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
- //
- // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
- // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
- //
- // CHECK: [[DO_MAX]]
- // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT:.+]]
- //
- // CHECK: [[MAX_ELSE]]
- // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
- // CHECK: br label {{%?}}[[MAX_CONT]]
- //
- // CHECK: [[MAX_CONT]]
- // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
- // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
- // CHECK: ret void
-
- //
- // Shuffle and reduce function
- // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
- // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
- // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
- // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
- //
- // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
- // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- //
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
- //
- // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
- // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
- // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
- //
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- //
- // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
- // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
- // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
- // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
- //
- // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
- // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
- // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
- //
- // Condition to reduce
- // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
- //
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
- //
- // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
- // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
- // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
- // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
- // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
- // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
- //
- // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
- // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
- // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
- //
- // CHECK: [[DO_REDUCE]]
- // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
- // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
- // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
- // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
- //
- // CHECK: [[REDUCE_ELSE]]
- // CHECK: br label {{%?}}[[REDUCE_CONT]]
- //
- // CHECK: [[REDUCE_CONT]]
- // Now check if we should just copy over the remote reduction list
- // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
- // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
- // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
- // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // CHECK: [[DO_COPY]]
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
- // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
- //
- // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
- // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // CHECK: [[COPY_CONT]]
- // CHECK: void
-
- //
- // Inter warp copy function
- // CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
- // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
- // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
- // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- //
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
- // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
- //
- // [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- //
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: br label {{%?}}[[COPY_CONT:.+]]
- //
- // CHECK: [[COPY_ELSE]]
- // CHECK: br label {{%?}}[[COPY_CONT]]
- //
- // Barrier after copy to shared memory storage medium.
- // CHECK: [[COPY_CONT]]
- // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
- // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
- //
- // Read into warp 0.
- // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
- // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
- //
- // CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
- // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
- // CHECK: br label {{%?}}[[READ_CONT:.+]]
- //
- // CHECK: [[READ_ELSE]]
- // CHECK: br label {{%?}}[[READ_CONT]]
- //
- // CHECK: [[READ_CONT]]
- // CHECK: ret
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN]](
+// CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
+// CHECK: [[EV:%.+]] = load double, double* [[E]], align
+// CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
+// CHECK: store double [[ADD]], double* [[E]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
+// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+
+// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
+// CHECK: [[EV:%.+]] = load double, double* [[E]], align
+// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
+// CHECK: store double [[ADD]], double* [[E_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
+//
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
+//
+// CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
+// CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
+// CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
+// CHECK: store double [[RES]], double* [[VAR_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT:%.+]] = alloca double
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+//
+// CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
+// CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
+// CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
+// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
+// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+// CHECK: br label
+// CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+// CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+// CHECK: br i1 [[DONE_COPY]], label
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+// CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN1]](
+// CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
+// CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
+// CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[TRUNC]], i8* [[C]], align
+// CHECK: [[DV:%.+]] = load float, float* [[D]], align
+// CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
+// CHECK: store float [[MUL]], float* [[D]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: store i8* [[C]], i8** [[PTR1]], align
+// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
+// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
+// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
+// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
+// CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
+// CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
+// CHECK: [[DV:%.+]] = load float, float* [[D]], align
+// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
+// CHECK: store float [[MUL]], float* [[D_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+//
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+//
+// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
+//
+// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
+//
+// CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
+// CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
+// CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
+// CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
+// CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+// CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
+// CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
+//
+// CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
+// CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
+// CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+// CHECK: store float [[RES]], float* [[VAR2_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
+// CHECK: [[REMOTE_ELT2:%.+]] = alloca float
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
+//
+// CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+// CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
+//
+// CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
+// CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+//
+// CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
+// CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
+// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
+// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
+// CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
+//
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
+// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+//
+// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
+// CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: ret
+
+// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
+//
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
+// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: br label {{%?}}[[EXECUTE:.+]]
+//
+// CHECK: [[EXECUTE]]
+// CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
+// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+//
+//
+// define internal void [[PFN2]](
+// CHECK: store i32 0, i32* [[A:%.+]], align
+// CHECK: store i16 -32768, i16* [[B:%.+]], align
+// CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
+// CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
+// CHECK: store i32 [[OR]], i32* [[A]], align
+// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+// CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
+// CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
+// CHECK: store i16 [[TRUNC]], i16* [[B]], align
+// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
+// CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
+// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
+// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
+// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+// CHECK: br i1 [[CMP]], label
+
+// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
+// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+// CHECK: store i32 [[OR]], i32* [[A_IN]], align
+// CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+// CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
+// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+// CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+// CHECK: br label
+//
+// CHECK: ret
+
+//
+// Reduction function
+// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1)
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
+//
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
+//
+// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
+//
+// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
+//
+// CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
+// CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
+// CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+// CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
+//
+// CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
+// CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
+// CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
+// CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
+//
+// CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+//
+// CHECK: [[DO_MAX]]
+// CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
+// CHECK: br label {{%?}}[[MAX_CONT:.+]]
+//
+// CHECK: [[MAX_ELSE]]
+// CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
+// CHECK: br label {{%?}}[[MAX_CONT]]
+//
+// CHECK: [[MAX_CONT]]
+// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
+// CHECK: ret void
+
+//
+// Shuffle and reduce function
+// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
+// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
+//
+// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+//
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+//
+// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
+// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
+// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
+//
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+//
+// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
+//
+// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
+// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
+// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+//
+// Condition to reduce
+// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+//
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+//
+// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+//
+// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+//
+// CHECK: [[DO_REDUCE]]
+// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+//
+// CHECK: [[REDUCE_ELSE]]
+// CHECK: br label {{%?}}[[REDUCE_CONT]]
+//
+// CHECK: [[REDUCE_CONT]]
+// Now check if we should just copy over the remote reduction list
+// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// CHECK: [[DO_COPY]]
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
+//
+// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// CHECK: [[COPY_CONT]]
+// CHECK: void
+
+//
+// Inter warp copy function
+// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1)
+// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+//
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+//
+// [[DO_COPY]]
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+//
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: br label {{%?}}[[COPY_CONT:.+]]
+//
+// CHECK: [[COPY_ELSE]]
+// CHECK: br label {{%?}}[[COPY_CONT]]
+//
+// Barrier after copy to shared memory storage medium.
+// CHECK: [[COPY_CONT]]
+// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
+//
+// Read into warp 0.
+// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+//
+// CHECK: [[DO_READ]]
+// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
+// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
+// CHECK: br label {{%?}}[[READ_CONT:.+]]
+//
+// CHECK: [[READ_ELSE]]
+// CHECK: br label {{%?}}[[READ_CONT]]
+//
+// CHECK: [[READ_CONT]]
+// CHECK: ret
#endif
diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
index 168b82057cac..b35609cc5e92 100644
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -61,28 +61,28 @@ int bar(int n){
}
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l32}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l37}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l42}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l47}}(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
// CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(
diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
index 8ff393f074e4..3a6e39dfdba1 100644
--- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -232,7 +232,7 @@ int bar(int n){
// CHECK: ret void
// CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
// CHECK-NOT: call void @__kmpc_serialized_parallel(
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
index 4fd0f71c5e5d..d40aad3dee77 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -100,7 +100,7 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l50(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
// CHECK: call void [[PARALLEL:@.+]](
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
@@ -128,7 +128,7 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
@@ -143,7 +143,7 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
@@ -159,7 +159,7 @@ int bar(int n){
// Distribute with collapse(2)
// CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
index a933c7e021b8..e9126fce7020 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -83,7 +83,7 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
@@ -109,7 +109,7 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -124,7 +124,7 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -140,7 +140,7 @@ int bar(int n){
// CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
index 8d12c857cb43..15f5f09f3899 100644
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp
@@ -70,7 +70,7 @@ int bar(int n){
}
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l37(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -78,7 +78,7 @@ int bar(int n){
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -86,7 +86,7 @@ int bar(int n){
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l48(
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
@@ -95,7 +95,7 @@ int bar(int n){
// CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l53({{.+}}, i{{32|64}} [[F_IN:%.+]])
// CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0)
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
index 6d5d6cd19bd6..0ccd71c3b55f 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -92,15 +92,7 @@ struct __kmpc_data_sharing_worker_slot_static {
void *DataEnd;
char Data[DS_Worker_Warp_Slot_Size];
};
-// Additional master slot type which is initialized with the default master slot
-// size of 4 bytes.
-struct __kmpc_data_sharing_master_slot_static {
- __kmpc_data_sharing_slot *Next;
- __kmpc_data_sharing_slot *Prev;
- void *PrevSlotStackPtr;
- void *DataEnd;
- char Data[DS_Slot_Size];
-};
+
extern DEVICE SHARED DataSharingStateTy DataSharingState;
////////////////////////////////////////////////////////////////////////////////
@@ -204,37 +196,6 @@ class omptarget_nvptx_TeamDescr {
// init
INLINE void InitTeamDescr();
- INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
- // If this is invoked by the master thread of the master warp then
- // initialize it with a smaller slot.
- if (IsMasterThread) {
- // Do not initialize this slot again if it has already been initalized.
- if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
- return 0;
- // Initialize the pointer to the end of the slot given the size of the
- // data section. DataEnd is non-inclusive.
- master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
- // We currently do not have a next slot.
- master_rootS[0].Next = 0;
- master_rootS[0].Prev = 0;
- master_rootS[0].PrevSlotStackPtr = 0;
- return (__kmpc_data_sharing_slot *)&master_rootS[0];
- }
- // Do not initialize this slot again if it has already been initalized.
- if (worker_rootS[wid].DataEnd ==
- &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
- return 0;
- // Initialize the pointer to the end of the slot given the size of the data
- // section. DataEnd is non-inclusive.
- worker_rootS[wid].DataEnd =
- &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
- // We currently do not have a next slot.
- worker_rootS[wid].Next = 0;
- worker_rootS[wid].Prev = 0;
- worker_rootS[wid].PrevSlotStackPtr = 0;
- return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
- }
-
INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
worker_rootS[wid].DataEnd =
&worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
@@ -253,7 +214,6 @@ class omptarget_nvptx_TeamDescr {
ALIGN(16)
__kmpc_data_sharing_worker_slot_static worker_rootS[DS_Max_Warp_Number];
- ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
};
////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index 6c1d5319595c..5ccc84539400 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -77,8 +77,7 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
omptarget_nvptx_workFn = 0;
}
-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
- int16_t RequiresDataSharing) {
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
@@ -134,15 +133,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)ThreadLimit);
-
- if (RequiresDataSharing && GetLaneId() == 0) {
- // Warp master initializes data sharing environment.
- unsigned WID = threadId / WARPSIZE;
- __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(
- WID, WID == WARPSIZE - 1);
- DataSharingState.SlotPtr[WID] = RootS;
- DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
- }
}
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index 4d352bc648fa..330880556293 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -421,8 +421,8 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
// non standard
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
-EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
- int16_t RequiresDataSharing);
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit,
+ int16_t RequiresOMPRuntime);
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
More information about the Openmp-commits
mailing list