[Openmp-commits] [openmp] cd0ea03 - [OpenMP][NFC] Remove unused and untested code from the device runtime
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Fri Jul 10 17:11:36 PDT 2020
Author: Johannes Doerfert
Date: 2020-07-10T19:09:41-05:00
New Revision: cd0ea03e6f157e8fb477cd8368b29e1448eeb265
URL: https://github.com/llvm/llvm-project/commit/cd0ea03e6f157e8fb477cd8368b29e1448eeb265
DIFF: https://github.com/llvm/llvm-project/commit/cd0ea03e6f157e8fb477cd8368b29e1448eeb265.diff
LOG: [OpenMP][NFC] Remove unused and untested code from the device runtime
Summary:
We carried a lot of unused and untested code in the device runtime.
Removing it helps for several reasons; among them, we are planning major
rewrites for which the reduced size is going to help a lot.
The number of code lines was reduced by 14%!
Before:
-------------------------------------------------------------------------------
Language                     files          blank        comment           code
-------------------------------------------------------------------------------
CUDA                            13            489            841           2454
C/C++ Header                    14            322            493           1377
C                               12            117            124            559
CMake                            4             64             64            262
C++                             1              6              6             39
-------------------------------------------------------------------------------
SUM:                            44            998           1528           4691
-------------------------------------------------------------------------------
After:
-------------------------------------------------------------------------------
Language                     files          blank        comment           code
-------------------------------------------------------------------------------
CUDA                            13            366            733           1879
C/C++ Header                    14            317            484           1293
C                               12            117            124            559
CMake                           4             64             64            262
C++                             1              6              6             39
-------------------------------------------------------------------------------
SUM:                            44            870           1411           4032
-------------------------------------------------------------------------------
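That is 4691 - 4032 = 659 lines of code removed, which matches the ~14%
claimed above. The tables look like cloc output (presumably something
along the lines of `cloc openmp/libomptarget/deviceRTLs` run before and
after the patch; the exact invocation is not part of the commit).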
Reviewers: hfinkel, jhuber6, fghanim, JonChesterfield, grokos, AndreyChurbanov, ye-luo, tianshilei1992, ggeorgakoudis, Hahnfeld, ABataev, hbae, ronlieb, gregrodgers
Subscribers: jvesely, yaxunl, bollu, guansong, jfb, sstefan1, aaron.ballman, openmp-commits, cfe-commits
Tags: #clang, #openmp
Differential Revision: https://reviews.llvm.org/D83349
Added:
Modified:
clang/test/OpenMP/nvptx_target_simd_codegen.cpp
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
openmp/libomptarget/deviceRTLs/common/omptarget.h
openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
openmp/libomptarget/deviceRTLs/common/src/libcall.cu
openmp/libomptarget/deviceRTLs/common/src/loop.cu
openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
openmp/libomptarget/deviceRTLs/common/src/parallel.cu
openmp/libomptarget/deviceRTLs/common/src/reduction.cu
openmp/libomptarget/deviceRTLs/common/src/support.cu
openmp/libomptarget/deviceRTLs/common/src/sync.cu
openmp/libomptarget/deviceRTLs/common/support.h
openmp/libomptarget/deviceRTLs/interface.h
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
Removed:
################################################################################
diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
index 073d6fa2f14e..7a1f01c1f1ad 100644
--- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp
@@ -78,7 +78,6 @@ int bar(int n){
// CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0)
// CHECK-NOT: call void @__kmpc_for_static_init
// CHECK-NOT: call void @__kmpc_for_static_fini
-// CHECK-NOT: call i32 @__kmpc_nvptx_simd_reduce_nowait(
// CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait(
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
// CHECK: ret void
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index 77a0ffb54f95..3c90b39282c9 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -140,8 +140,6 @@ DEVICE int GetNumberOfThreadsInBlock();
DEVICE unsigned GetWarpId();
DEVICE unsigned GetLaneId();
-DEVICE bool __kmpc_impl_is_first_active_thread();
-
// Locks
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
index 986eb3677dcf..88807de4e19c 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -200,7 +200,6 @@ class omptarget_nvptx_TeamDescr {
INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
return workDescrForActiveParallel;
}
- INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
// init
INLINE void InitTeamDescr();
@@ -251,7 +250,6 @@ class omptarget_nvptx_TeamDescr {
levelZeroTaskDescr; // icv for team master initial thread
omptarget_nvptx_WorkDescr
workDescrForActiveParallel; // one, ONLY for the active par
- uint64_t lastprivateIterBuffer;
ALIGN(16)
__kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
@@ -277,10 +275,6 @@ class omptarget_nvptx_ThreadPrivateContext {
INLINE uint16_t &NumThreadsForNextParallel(int tid) {
return nextRegion.tnum[tid];
}
- // simd
- INLINE uint16_t &SimdLimitForNextSimd(int tid) {
- return nextRegion.slim[tid];
- }
// schedule (for dispatch)
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
@@ -304,8 +298,6 @@ class omptarget_nvptx_ThreadPrivateContext {
// Only one of the two is live at the same time.
// parallel
uint16_t tnum[MAX_THREADS_PER_TEAM];
- // simd limit
- uint16_t slim[MAX_THREADS_PER_TEAM];
} nextRegion;
// schedule (for dispatch)
kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
index f6523c8ce8aa..ca2fd1d30754 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -17,297 +17,6 @@ INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
}
-/// Return the provided size aligned to the size of a pointer.
-INLINE static size_t AlignVal(size_t Val) {
- const size_t Align = (size_t)sizeof(void *);
- if (Val & (Align - 1)) {
- Val += Align;
- Val &= ~(Align - 1);
- }
- return Val;
-}
-
-#define DSFLAG 0
-#define DSFLAG_INIT 0
-#define DSPRINT(_flag, _str, _args...) \
- { \
- if (_flag) { \
- /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \
- } \
- }
-#define DSPRINT0(_flag, _str) \
- { \
- if (_flag) { \
- /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \
- } \
- }
-
-// Initialize the shared data structures. This is expected to be called for the
-// master thread and warp masters. \param RootS: A pointer to the root of the
-// data sharing stack. \param InitialDataSize: The initial size of the data in
-// the slot.
-EXTERN void
-__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
- size_t InitialDataSize) {
- ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
- DSPRINT0(DSFLAG_INIT,
- "Entering __kmpc_initialize_data_sharing_environment\n");
-
- unsigned WID = GetWarpId();
- DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
-
- omptarget_nvptx_TeamDescr *teamDescr =
- &omptarget_nvptx_threadPrivateContext->TeamContext();
- __kmpc_data_sharing_slot *RootS =
- teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));
-
- DataSharingState.SlotPtr[WID] = RootS;
- DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
-
- // We don't need to initialize the frame and active threads.
-
- DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
- DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
- DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
- (unsigned long long)RootS->DataEnd);
- DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
- (unsigned long long)RootS->Next);
- DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
- (unsigned long long)DataSharingState.SlotPtr[WID]);
- DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
- (unsigned long long)DataSharingState.StackPtr[WID]);
-
- DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
-}
-
-EXTERN void *__kmpc_data_sharing_environment_begin(
- __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
- void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
- size_t SharingDataSize, size_t SharingDefaultDataSize,
- int16_t IsOMPRuntimeInitialized) {
-
- DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
-
- // If the runtime has been elided, used shared memory for master-worker
- // data sharing.
- if (!IsOMPRuntimeInitialized)
- return (void *)&DataSharingState;
-
- DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
- DSPRINT(DSFLAG, "Default Data Size %016llx\n",
- (unsigned long long)SharingDefaultDataSize);
-
- unsigned WID = GetWarpId();
- __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
-
- __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
- void *&StackP = DataSharingState.StackPtr[WID];
- void * volatile &FrameP = DataSharingState.FramePtr[WID];
- __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];
-
- DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
- // Save the current values.
- *SavedSharedSlot = SlotP;
- *SavedSharedStack = StackP;
- *SavedSharedFrame = FrameP;
- *SavedActiveThreads = ActiveT;
-
- DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
- DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
- DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
- DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
- DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
-
- // Only the warp active master needs to grow the stack.
- if (__kmpc_impl_is_first_active_thread()) {
- // Save the current active threads.
- ActiveT = CurActiveThreads;
-
- // Make sure we use aligned sizes to avoid rematerialization of data.
- SharingDataSize = AlignVal(SharingDataSize);
- // FIXME: The default data size can be assumed to be aligned?
- SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);
-
- // Check if we have room for the data in the current slot.
- const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
- const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
- const uintptr_t RequiredEndAddress =
- CurrentStartAddress + (uintptr_t)SharingDataSize;
-
- DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
- DSPRINT(DSFLAG, "Default Data Size %016llx\n",
- (unsigned long long)SharingDefaultDataSize);
- DSPRINT(DSFLAG, "Current Start Address %016llx\n",
- (unsigned long long)CurrentStartAddress);
- DSPRINT(DSFLAG, "Current End Address %016llx\n",
- (unsigned long long)CurrentEndAddress);
- DSPRINT(DSFLAG, "Required End Address %016llx\n",
- (unsigned long long)RequiredEndAddress);
- DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);
-
- // If we require a new slot, allocate it and initialize it (or attempt to
- // reuse one). Also, set the shared stack and slot pointers to the new
- // place. If we do not need to grow the stack, just adapt the stack and
- // frame pointers.
- if (CurrentEndAddress < RequiredEndAddress) {
- size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
- ? SharingDataSize
- : SharingDefaultDataSize;
- __kmpc_data_sharing_slot *NewSlot = 0;
-
- // Attempt to reuse an existing slot.
- if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
- uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
- (uintptr_t)(&ExistingSlot->Data[0]);
- if (ExistingSlotSize >= NewSize) {
- DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
- (unsigned long long)ExistingSlot);
- NewSlot = ExistingSlot;
- } else {
- DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
- (unsigned long long)SlotP->Next);
- SafeFree(ExistingSlot, "Failed reuse");
- }
- }
-
- if (!NewSlot) {
- NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc(
- sizeof(__kmpc_data_sharing_slot) + NewSize,
- "Warp master slot allocation");
- DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
- (unsigned long long)NewSlot, NewSize);
- }
-
- NewSlot->Next = 0;
- NewSlot->DataEnd = &NewSlot->Data[NewSize];
-
- SlotP->Next = NewSlot;
- SlotP = NewSlot;
- StackP = &NewSlot->Data[SharingDataSize];
- FrameP = &NewSlot->Data[0];
- } else {
-
- // Clean up any old slot that we may still have. The slot producers, do
- // not eliminate them because that may be used to return data.
- if (SlotP->Next) {
- DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
- (unsigned long long)SlotP->Next);
- SafeFree(SlotP->Next, "Old slot not required");
- SlotP->Next = 0;
- }
-
- FrameP = StackP;
- StackP = (void *)RequiredEndAddress;
- }
- }
-
- // FIXME: Need to see the impact of doing it here.
- __kmpc_impl_threadfence_block();
-
- DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");
-
- // All the threads in this warp get the frame they should work with.
- return FrameP;
-}
-
-EXTERN void __kmpc_data_sharing_environment_end(
- __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
- void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
- int32_t IsEntryPoint) {
-
- DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
-
- unsigned WID = GetWarpId();
-
- if (IsEntryPoint) {
- if (__kmpc_impl_is_first_active_thread()) {
- DSPRINT0(DSFLAG, "Doing clean up\n");
-
- // The master thread cleans the saved slot, because this is an environment
- // only for the master.
- __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
- ? *SavedSharedSlot
- : DataSharingState.SlotPtr[WID];
-
- if (S->Next) {
- SafeFree(S->Next, "Sharing environment end");
- S->Next = 0;
- }
- }
-
- DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
- return;
- }
-
- __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
-
- // Only the warp master can restore the stack and frame information, and only
- // if there are no other threads left behind in this environment (i.e. the
- // warp diverged and returns in different places). This only works if we
- // assume that threads will converge right after the call site that started
- // the environment.
- if (__kmpc_impl_is_first_active_thread()) {
- __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];
-
- DSPRINT0(DSFLAG, "Before restoring the stack\n");
- // Zero the bits in the mask. If it is still different from zero, then we
- // have other threads that will return after the current ones.
- ActiveT &= ~CurActive;
-
- DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
- (unsigned)CurActive, (unsigned)ActiveT);
-
- if (!ActiveT) {
- // No other active threads? Great, lets restore the stack.
-
- __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
- void *&StackP = DataSharingState.StackPtr[WID];
- void * volatile &FrameP = DataSharingState.FramePtr[WID];
-
- SlotP = *SavedSharedSlot;
- StackP = *SavedSharedStack;
- FrameP = *SavedSharedFrame;
- ActiveT = *SavedActiveThreads;
-
- DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
- (unsigned long long)SlotP);
- DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
- (unsigned long long)StackP);
- DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
- (unsigned long long)FrameP);
- DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
- }
- }
-
- // FIXME: Need to see the impact of doing it here.
- __kmpc_impl_threadfence_block();
-
- DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
- return;
-}
-
-EXTERN void *
-__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
- int16_t IsOMPRuntimeInitialized) {
- DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
-
- // If the runtime has been elided, use shared memory for master-worker
- // data sharing. We're reusing the statically allocated data structure
- // that is used for standard data sharing.
- if (!IsOMPRuntimeInitialized)
- return (void *)&DataSharingState;
-
- // Get the frame used by the requested thread.
-
- unsigned SourceWID = SourceThreadID / WARPSIZE;
-
- DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);
-
- void * volatile P = DataSharingState.FramePtr[SourceWID];
- DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
- return P;
-}
-
////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
index 89c481bcf8da..cfa438ff1717 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -362,53 +362,3 @@ EXTERN int omp_test_lock(omp_lock_t *lock) {
PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
return rc;
}
-
-// for xlf Fortran
-// Fortran, the return is LOGICAL type
-
-#define FLOGICAL long
-EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() {
- int ret = omp_is_initial_device();
- if (ret == 0)
- return (FLOGICAL)0;
- else
- return (FLOGICAL)1;
-}
-
-EXTERN int __xlf_omp_is_initial_device_i4() {
- int ret = omp_is_initial_device();
- if (ret == 0)
- return 0;
- else
- return 1;
-}
-
-EXTERN long __xlf_omp_get_team_num_i4() {
- int ret = omp_get_team_num();
- return (long)ret;
-}
-
-EXTERN long __xlf_omp_get_num_teams_i4() {
- int ret = omp_get_num_teams();
- return (long)ret;
-}
-
-EXTERN void xlf_debug_print_int(int *p) {
- printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
-}
-
-EXTERN void xlf_debug_print_long(long *p) {
- printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
-}
-
-EXTERN void xlf_debug_print_float(float *p) {
- printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
-}
-
-EXTERN void xlf_debug_print_double(double *p) {
- printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
-}
-
-EXTERN void xlf_debug_print_addr(void *p) {
- printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p);
-}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
index 417460db138a..f625d9ea9e23 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -754,55 +754,3 @@ void __kmpc_for_static_init_8u_simple_generic(
EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}
-
-namespace {
-INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
- int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- // On Volta and newer architectures we require that all lanes in
- // a warp (at least, all present for the kernel launch) participate in the
- // barrier. This is enforced when launching the parallel region. An
- // exception is when there are < WARPSIZE workers. In this case only 1 worker
- // is started, so we don't need a barrier.
- if (NumThreads > 1) {
-#endif
- __kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- }
-#endif
-}
-}; // namespace
-
-EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
- int32_t varNum, void *array) {
- PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
- ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
- "Expected non-SPMD mode + initialized runtime.");
-
- omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
- uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc));
- uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
- for (unsigned i = 0; i < varNum; i++) {
- // Reset buffer.
- if (gtid == 0)
- *Buffer = 0; // Reset to minimum loop iteration value.
-
- // Barrier.
- syncWorkersInGenericMode(NumThreads);
-
- // Atomic max of iterations.
- uint64_t *varArray = (uint64_t *)array;
- uint64_t elem = varArray[i];
- (void)__kmpc_atomic_max((unsigned long long int *)Buffer,
- (unsigned long long int)elem);
-
- // Barrier.
- syncWorkersInGenericMode(NumThreads);
-
- // Read max value and update thread private array.
- varArray[i] = *Buffer;
-
- // Barrier.
- syncWorkersInGenericMode(NumThreads);
- }
-}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index 23fbd00cacaf..6c1d5319595c 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -25,13 +25,6 @@ extern DEVICE
// init entry points
////////////////////////////////////////////////////////////////////////////////
-EXTERN void __kmpc_kernel_init_params(void *Ptr) {
- PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n",
- OMPTARGET_NVPTX_VERSION);
-
- SetTeamsReductionScratchpadPtr(Ptr);
-}
-
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
OMPTARGET_NVPTX_VERSION);
@@ -152,10 +145,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
}
}
-EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() {
- __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized());
-}
-
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
// We're not going to pop the task descr stack of each thread since
// there are no more parallel regions in SPMD mode.
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
index ab031e99e51f..4f3c3ac0c08a 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -35,161 +35,6 @@
#include "common/omptarget.h"
#include "target_impl.h"
-typedef struct ConvergentSimdJob {
- omptarget_nvptx_TaskDescr taskDescr;
- omptarget_nvptx_TaskDescr *convHeadTaskDescr;
- uint16_t slimForNextSimd;
-} ConvergentSimdJob;
-
-////////////////////////////////////////////////////////////////////////////////
-// support for convergent simd (team of threads in a warp only)
-////////////////////////////////////////////////////////////////////////////////
-EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
- __kmpc_impl_lanemask_t Mask,
- bool *IsFinal, int32_t *LaneSource,
- int32_t *LaneId, int32_t *NumLanes) {
- PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
- __kmpc_impl_lanemask_t ConvergentMask = Mask;
- int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
- __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
- *LaneSource += __kmpc_impl_ffs(WorkRemaining);
- *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
- __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
- *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
-
- int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
- int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
-
- ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
- int32_t SimdLimit =
- omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
- job->slimForNextSimd = SimdLimit;
-
- int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
- // reset simdlimit to avoid propagating to successive #simd
- if (SimdLimitSource > 0 && threadId == sourceThreadId)
- omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
-
- // We cannot have more than the # of convergent threads.
- if (SimdLimitSource > 0)
- *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource);
- else
- *NumLanes = ConvergentSize;
- ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
- (int)*NumLanes);
-
- // Set to true for lanes participating in the simd region.
- bool isActive = false;
- // Initialize state for active threads.
- if (*LaneId < *NumLanes) {
- omptarget_nvptx_TaskDescr *currTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
- omptarget_nvptx_TaskDescr *sourceTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
- sourceThreadId);
- job->convHeadTaskDescr = currTaskDescr;
- // install top descriptor from the thread for which the lanes are working.
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
- sourceTaskDescr);
- isActive = true;
- }
-
- // requires a memory fence between threads of a warp
- return isActive;
-}
-
-EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
- PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
- // pop stack
- int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
- ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
- omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
- job->slimForNextSimd;
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
- threadId, job->convHeadTaskDescr);
-}
-
-typedef struct ConvergentParallelJob {
- omptarget_nvptx_TaskDescr taskDescr;
- omptarget_nvptx_TaskDescr *convHeadTaskDescr;
- uint16_t tnumForNextPar;
-} ConvergentParallelJob;
-
-////////////////////////////////////////////////////////////////////////////////
-// support for convergent parallelism (team of threads in a warp only)
-////////////////////////////////////////////////////////////////////////////////
-EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
- __kmpc_impl_lanemask_t Mask,
- bool *IsFinal,
- int32_t *LaneSource) {
- PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
- __kmpc_impl_lanemask_t ConvergentMask = Mask;
- int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
- __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
- *LaneSource += __kmpc_impl_ffs(WorkRemaining);
- *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
- __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
- uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
-
- int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
- int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
-
- ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
- int32_t NumThreadsClause =
- omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
- job->tnumForNextPar = NumThreadsClause;
-
- int32_t NumThreadsSource =
- __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
- // reset numthreads to avoid propagating to successive #parallel
- if (NumThreadsSource > 0 && threadId == sourceThreadId)
- omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
- 0;
-
- // We cannot have more than the # of convergent threads.
- uint16_t NumThreads;
- if (NumThreadsSource > 0)
- NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource);
- else
- NumThreads = ConvergentSize;
- ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
- (int)NumThreads);
-
- // Set to true for workers participating in the parallel region.
- bool isActive = false;
- // Initialize state for active threads.
- if (OmpId < NumThreads) {
- // init L2 task descriptor and storage for the L1 parallel task descriptor.
- omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
- ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
- omptarget_nvptx_TaskDescr *currTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
- omptarget_nvptx_TaskDescr *sourceTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
- sourceThreadId);
- job->convHeadTaskDescr = currTaskDescr;
- newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
- // install new top descriptor
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
- newTaskDescr);
- isActive = true;
- }
-
- // requires a memory fence between threads of a warp
- return isActive;
-}
-
-EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
- PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
- // pop stack
- int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
- ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
- threadId, job->convHeadTaskDescr);
- omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
- job->tnumForNextPar;
-}
-
////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////
@@ -446,14 +291,6 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
num_threads;
}
-EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
- int32_t simd_limit) {
- PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
- ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
- tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
- omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
-}
-
// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
index 427c90a7e091..0230fa26ac10 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -73,22 +73,6 @@ gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
return (logical_lane_id == 0);
}
-EXTERN
-int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
- size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct,
- kmp_InterWarpCopyFctPtr cpyFct) {
- __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
- if (Liveness == __kmpc_impl_all_lanes) {
- gpu_regular_warp_reduce(reduce_data, shflFct);
- return GetThreadIdInBlock() % WARPSIZE ==
- 0; // Result on lane 0 of the simd warp.
- } else {
- return gpu_irregular_simd_reduce(
- reduce_data, shflFct); // Result on the first active lane.
- }
-}
-
INLINE
static int32_t nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
@@ -177,14 +161,6 @@ static int32_t nvptx_parallel_reduce_nowait(
#endif // __CUDA_ARCH__ >= 700
}
-EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
- return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
- reduce_data, shflFct, cpyFct,
- isSPMDMode(), isRuntimeUninitialized());
-}
-
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
@@ -195,201 +171,6 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}
-EXTERN
-int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
- return nvptx_parallel_reduce_nowait(
- global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
- /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true);
-}
-
-EXTERN
-int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
- return nvptx_parallel_reduce_nowait(
- global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
- /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true);
-}
-
-INLINE
-static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
- size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct,
- kmp_InterWarpCopyFctPtr cpyFct,
- kmp_CopyToScratchpadFctPtr scratchFct,
- kmp_LoadReduceFctPtr ldFct,
- bool isSPMDExecutionMode) {
- uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
- // In non-generic mode all workers participate in the teams reduction.
- // In generic mode only the team master participates in the teams
- // reduction because the workers are waiting for parallel work.
- uint32_t NumThreads =
- isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
- : /*Master thread only*/ 1;
- uint32_t TeamId = GetBlockIdInKernel();
- uint32_t NumTeams = GetNumberOfBlocksInKernel();
- static SHARED volatile bool IsLastTeam;
-
- // Team masters of all teams write to the scratchpad.
- if (ThreadId == 0) {
- unsigned int *timestamp = GetTeamsReductionTimestamp();
- char *scratchpad = GetTeamsReductionScratchpad();
-
- scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
- __kmpc_impl_threadfence();
-
- // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
- // It resets 'timestamp' back to 0 once the last team increments
- // this counter.
- unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1);
- IsLastTeam = val == NumTeams - 1;
- }
-
- // We have to wait on L1 barrier because in GENERIC mode the workers
- // are waiting on barrier 0 for work.
- //
- // If we guard this barrier as follows it leads to deadlock, probably
- // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
- uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
- __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
-
- // If this team is not the last, quit.
- if (/* Volatile read by all threads */ !IsLastTeam)
- return 0;
-
- //
- // Last team processing.
- //
-
- // Threads in excess of #teams do not participate in reduction of the
- // scratchpad values.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- uint32_t ActiveThreads = NumThreads;
- if (NumTeams < NumThreads) {
- ActiveThreads =
- (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
- }
- if (ThreadId >= ActiveThreads)
- return 0;
-
- // Load from scratchpad and reduce.
- char *scratchpad = GetTeamsReductionScratchpad();
- ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
- for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
- ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
-
- uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
- uint32_t WarpId = ThreadId / WARPSIZE;
-
- // Reduce across warps to the warp master.
- if ((ActiveThreads % WARPSIZE == 0) ||
- (WarpId < WarpsNeeded - 1)) // Full warp
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (ActiveThreads > 1) // Partial warp but contiguous lanes
- // Only SPMD execution mode comes thru this case.
- gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/ActiveThreads % WARPSIZE,
- /*LaneId=*/ThreadId % WARPSIZE);
-
- // When we have more than [warpsize] number of threads
- // a block reduction is performed here.
- if (ActiveThreads > WARPSIZE) {
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
- }
-#else
- if (ThreadId >= NumTeams)
- return 0;
-
- // Load from scratchpad and reduce.
- char *scratchpad = GetTeamsReductionScratchpad();
- ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
- for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
- ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
-
- // Reduce across warps to the warp master.
- __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
- if (Liveness == __kmpc_impl_all_lanes) // Full warp
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else // Partial warp but contiguous lanes
- gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/__kmpc_impl_popc(Liveness),
- /*LaneId=*/ThreadId % WARPSIZE);
-
- // When we have more than [warpsize] number of threads
- // a block reduction is performed here.
- uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
- if (ActiveThreads > WARPSIZE) {
- uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- uint32_t WarpId = ThreadId / WARPSIZE;
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
- }
-#endif // __CUDA_ARCH__ >= 700
-
- return ThreadId == 0;
-}
-
-EXTERN
-int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
- size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct,
- kmp_InterWarpCopyFctPtr cpyFct,
- kmp_CopyToScratchpadFctPtr scratchFct,
- kmp_LoadReduceFctPtr ldFct) {
- return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
- reduce_data, shflFct, cpyFct, scratchFct,
- ldFct, isSPMDMode());
-}
-
-EXTERN
-int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
- kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
- return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
- reduce_data, shflFct, cpyFct, scratchFct,
- ldFct, /*isSPMDExecutionMode=*/true);
-}
-
-EXTERN
-int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
- kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
- return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
- reduce_data, shflFct, cpyFct, scratchFct,
- ldFct, /*isSPMDExecutionMode=*/false);
-}
-
-EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
- int32_t global_tid,
- kmp_CriticalName *crit) {
- if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
- return 0;
- // The master thread of the team actually does the reduction.
- while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u))
- ;
- return 1;
-}
-
-EXTERN void
-__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
- kmp_CriticalName *crit) {
- __kmpc_impl_threadfence_system();
- (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u);
-}
-
INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
return checkGenericMode(loc) || IsTeamMaster(ThreadId);
}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
index 85747511d46c..e02c533e78b2 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -264,6 +264,3 @@ DEVICE char *GetTeamsReductionScratchpad() {
return static_cast<char *>(ReductionScratchpadPtr) + 256;
}
-DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
- ReductionScratchpadPtr = ScratchpadPtr;
-}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
index 2ac3e3f9c7c0..3979e2054fc9 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
@@ -79,23 +79,6 @@ EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}
-// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
-// parallel region and that all worker threads participate.
-EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
- int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
- // The #threads parameter must be rounded up to the WARPSIZE.
- int threads =
- WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
-
- PRINT(LD_SYNC,
- "call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
- "%d\n",
- (int)numberOfActiveOMPThreads, (int)threads);
- // Barrier #1 is for synchronization among active threads.
- __kmpc_impl_named_sync(L1_BARRIER, threads);
- PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
-}
-
////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
index 913c4c3c323f..a46432825782 100644
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -94,6 +94,5 @@ DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
////////////////////////////////////////////////////////////////////////////////
DEVICE unsigned int *GetTeamsReductionTimestamp();
DEVICE char *GetTeamsReductionScratchpad();
-DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
#endif
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index 3c216a5e61c5..39ce73cba957 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -193,17 +193,10 @@ typedef struct ident {
// parallel defs
typedef ident_t kmp_Ident;
-typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...);
-typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData);
typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
int16_t lane_offset,
int16_t shortCircuit);
-typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad,
- int32_t index, int32_t width);
-typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad,
- int32_t index, int32_t width,
- int32_t reduce);
typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);
// task defs
@@ -227,12 +220,6 @@ typedef int32_t kmp_CriticalName[8];
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid,
int32_t num_threads);
-// simd
-EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid,
- int32_t simd_limit);
-// aee ... not supported
-// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr
-// microtask, ...);
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid);
@@ -354,61 +341,25 @@ EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);
-// Support for reducing conditional lastprivate variables
-EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc,
- int32_t global_tid,
- int32_t varNum, void *array);
-
// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
-EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct);
-EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
-EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
-EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
kmp_ListGlobalFctPtr glredFct);
-EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
- kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
-EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
- kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
-EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
- kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
-EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
- int32_t global_tid,
- kmp_CriticalName *crit);
-EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc,
- int32_t global_tid,
- kmp_CriticalName *crit);
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
// sync barrier
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
-EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);
// single
@@ -468,29 +419,16 @@ EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
int32_t cancelVal);
// non standard
-EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr);
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
int16_t RequiresDataSharing);
-EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit();
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
int16_t IsOMPRuntimeInitialized);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_kernel_end_parallel();
-EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer,
- __kmpc_impl_lanemask_t Mask,
- bool *IsFinal,
- int32_t *LaneSource);
-EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
-EXTERN bool __kmpc_kernel_convergent_simd(void *buffer,
- __kmpc_impl_lanemask_t Mask,
- bool *IsFinal, int32_t *LaneSource,
- int32_t *LaneId, int32_t *NumLanes);
-EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
-
EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
@@ -512,22 +450,6 @@ struct __kmpc_data_sharing_slot {
void *DataEnd;
char Data[];
};
-EXTERN void
-__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS,
- size_t InitialDataSize);
-EXTERN void *__kmpc_data_sharing_environment_begin(
- __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
- void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
- size_t SharingDataSize, size_t SharingDefaultDataSize,
- int16_t IsOMPRuntimeInitialized);
-EXTERN void __kmpc_data_sharing_environment_end(
- __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
- void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads,
- int32_t IsEntryPoint);
-
-EXTERN void *
-__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
- int16_t IsOMPRuntimeInitialized);
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 1b966510ec7e..d009e36a522f 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -195,15 +195,6 @@ INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
-// Return true if this is the first active thread in the warp.
-INLINE bool __kmpc_impl_is_first_active_thread() {
- unsigned long long Mask = __kmpc_impl_activemask();
- unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
- unsigned long long Sh = Mask << ShNum;
- // Truncate Sh to the 32 lower bits
- return (unsigned)Sh == 0;
-}
-
// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
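A note on that last hunk: the removed __kmpc_impl_is_first_active_thread
used a lane-mask shift trick that is easy to misread. Below is a minimal
host-side sketch of the same logic, assuming a 32-lane warp (the helper
name and the test values are illustrative, not part of the runtime):

#include <assert.h>
#include <stdint.h>

// True if 'lane' is the lowest set bit of 'active_mask', i.e. the first
// active lane of a 32-lane warp. Shifting left by (32 - lane) pushes bit
// 'lane' and all higher bits past bit 31, so the low 32 bits hold exactly
// the lanes below 'lane'; if none of them is active, 'lane' is first.
static int is_first_active_lane(uint32_t active_mask, unsigned lane) {
  uint64_t shifted = (uint64_t)active_mask << (32 - lane);
  return (uint32_t)shifted == 0;
}

int main(void) {
  assert(is_first_active_lane(0xF0u, 4));  // lanes 4-7 active; 4 is first
  assert(!is_first_active_lane(0xF0u, 5)); // lane 5 is active but not first
  assert(is_first_active_lane(0x1u, 0));   // only lane 0 active
  return 0;
}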