[Openmp-commits] [openmp] 21d91a8 - [libomptarget][devicertl] Replace lanemask with uint64 at interface
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Wed Aug 18 12:47:49 PDT 2021
Author: Jon Chesterfield
Date: 2021-08-18T20:47:33+01:00
New Revision: 21d91a8ef319eec9c2c272e19beee726429524aa
URL: https://github.com/llvm/llvm-project/commit/21d91a8ef319eec9c2c272e19beee726429524aa
DIFF: https://github.com/llvm/llvm-project/commit/21d91a8ef319eec9c2c272e19beee726429524aa.diff
LOG: [libomptarget][devicertl] Replace lanemask with uint64 at interface
Use uint64_t for lanemask on all GPU architectures at the interface
with clang. Updates tests. The deviceRTL is always linked as IR so the zext
and trunc introduced for wave32 architectures will fold after inlining.
Simplification partly motivated by amdgpu gfx10 which will be wave32 and
is awkward to express in the current arch-dependent typedef interface.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D108317
Added:
Modified:
clang/test/OpenMP/nvptx_parallel_codegen.cpp
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
llvm/test/Transforms/OpenMP/add_attributes.ll
openmp/libomptarget/DeviceRTL/include/Interface.h
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
openmp/libomptarget/deviceRTLs/common/src/sync.cu
openmp/libomptarget/deviceRTLs/interface.h
Removed:
################################################################################
diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
index 7cb86b80e158f..712c5a41c573d 100644
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -485,7 +485,7 @@ int bar(int n){
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK3-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -508,7 +508,7 @@ int bar(int n){
// CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK3-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK3: omp.critical.sync:
-// CHECK3-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK3-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK3-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK3-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP]]
@@ -938,7 +938,7 @@ int bar(int n){
// CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK4-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -961,7 +961,7 @@ int bar(int n){
// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK4-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK4: omp.critical.sync:
-// CHECK4-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK4-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK4-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK4-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]]
@@ -1391,7 +1391,7 @@ int bar(int n){
// CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK5-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -1414,7 +1414,7 @@ int bar(int n){
// CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK5: omp.critical.sync:
-// CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK5-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]]
@@ -1663,7 +1663,7 @@ int bar(int n){
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK1-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -1686,7 +1686,7 @@ int bar(int n){
// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK1: omp.critical.sync:
-// CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK1-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]]
@@ -1935,7 +1935,7 @@ int bar(int n){
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
-// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask()
+// CHECK2-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
@@ -1958,7 +1958,7 @@ int bar(int n){
// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
// CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]]
// CHECK2: omp.critical.sync:
-// CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]])
+// CHECK2-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
// CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
// CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]]
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index c90659a959706..5743e2df77088 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -654,9 +654,6 @@ class OpenMPIRBuilder {
omp::IdentFlag Flags = omp::IdentFlag(0),
unsigned Reserve2Flags = 0);
- // Get the type corresponding to __kmpc_impl_lanemask_t from the deviceRTL
- Type *getLanemaskType();
-
/// Generate control flow and cleanup for cancellation.
///
/// \param CancelFlag Flag indicating if the cancellation is performed.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 26141c9ce249e..e084036ac60b5 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -39,7 +39,6 @@ __OMP_TYPE(Int32Ptr)
__OMP_TYPE(Int64Ptr)
OMP_TYPE(SizeTy, M.getDataLayout().getIntPtrType(Ctx))
-OMP_TYPE(LanemaskTy, getLanemaskType())
#define __OMP_PTR_TYPE(NAME, BASE) OMP_TYPE(NAME, BASE->getPointerTo())
@@ -443,8 +442,8 @@ __OMP_RTL(__kmpc_parallel_level, false, Int8, )
__OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
__OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
-__OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)
-__OMP_RTL(__kmpc_syncwarp, false, Void, LanemaskTy)
+__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
+__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
__OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6ec3b4d891a03..29fe2a8f0b391 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -261,14 +261,6 @@ Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
return Builder.CreatePointerCast(Ident, IdentPtr);
}
-Type *OpenMPIRBuilder::getLanemaskType() {
- LLVMContext &Ctx = M.getContext();
- Triple triple(M.getTargetTriple());
-
- // This test is adequate until deviceRTL has finer grained lane widths
- return triple.isAMDGCN() ? Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx);
-}
-
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) {
Constant *&SrcLocStr = SrcLocStrMap[LocStr];
if (!SrcLocStr) {
diff --git a/llvm/test/Transforms/OpenMP/add_attributes.ll b/llvm/test/Transforms/OpenMP/add_attributes.ll
index efc263ed16305..049b0c8da3450 100644
--- a/llvm/test/Transforms/OpenMP/add_attributes.ll
+++ b/llvm/test/Transforms/OpenMP/add_attributes.ll
@@ -626,9 +626,9 @@ declare void @__kmpc_destroy_allocator(i32, i8*)
declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
-declare i32 @__kmpc_warp_active_thread_mask()
+declare i64 @__kmpc_warp_active_thread_mask()
-declare void @__kmpc_syncwarp(i32)
+declare void @__kmpc_syncwarp(i64)
declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**)
@@ -1149,10 +1149,10 @@ attributes #0 = { noinline cold }
; CHECK-NEXT: declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
; CHECK: ; Function Attrs: convergent nounwind
-; CHECK-NEXT: declare i32 @__kmpc_warp_active_thread_mask()
+; CHECK-NEXT: declare i64 @__kmpc_warp_active_thread_mask()
; CHECK: ; Function Attrs: convergent nounwind
-; CHECK-NEXT: declare void @__kmpc_syncwarp(i32)
+; CHECK-NEXT: declare void @__kmpc_syncwarp(i64)
; CHECK: ; Function Attrs: nounwind
; CHECK-NEXT: declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**)
@@ -1677,10 +1677,10 @@ attributes #0 = { noinline cold }
; OPTIMISTIC-NEXT: declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64)
; OPTIMISTIC: ; Function Attrs: convergent nounwind
-; OPTIMISTIC-NEXT: declare i32 @__kmpc_warp_active_thread_mask()
+; OPTIMISTIC-NEXT: declare i64 @__kmpc_warp_active_thread_mask()
; OPTIMISTIC: ; Function Attrs: convergent nounwind
-; OPTIMISTIC-NEXT: declare void @__kmpc_syncwarp(i32)
+; OPTIMISTIC-NEXT: declare void @__kmpc_syncwarp(i64)
; OPTIMISTIC: ; Function Attrs: nounwind
; OPTIMISTIC-NEXT: declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**)
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
index 21ff48c30d2cc..9ed396d06af4a 100644
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -247,9 +247,9 @@ void __kmpc_end_single(IdentTy *Loc, int32_t TId);
void __kmpc_flush(IdentTy *Loc);
-__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask();
+uint64_t __kmpc_warp_active_thread_mask(void);
-void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask);
+void __kmpc_syncwarp(uint64_t Mask);
void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index a055ad6d17ec2..c9a1ac6f73697 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -286,11 +286,9 @@ void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); }
-__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
- return mapping::activemask();
-}
+uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); }
-void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { synchronize::warp(Mask); }
+void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); }
void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
omp_set_lock(reinterpret_cast<omp_lock_t *>(Name));
diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
index 1dcd9abfa9e63..8711cd200051a 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
@@ -123,7 +123,7 @@ EXTERN void __kmpc_flush(kmp_Ident *loc) {
// Vote
////////////////////////////////////////////////////////////////////////////////
-EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
+EXTERN uint64_t __kmpc_warp_active_thread_mask(void) {
PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
return __kmpc_impl_activemask();
}
@@ -132,7 +132,7 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
// Syncwarp
////////////////////////////////////////////////////////////////////////////////
-EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) {
+EXTERN void __kmpc_syncwarp(uint64_t Mask) {
PRINT0(LD_IO, "call __kmpc_syncwarp\n");
__kmpc_impl_syncwarp(Mask);
}
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index 2e80dc3a82ac9..ee5fc5b1f75c9 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -375,9 +375,9 @@ EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
EXTERN void __kmpc_flush(kmp_Ident *loc);
// vote
-EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask();
+EXTERN uint64_t __kmpc_warp_active_thread_mask(void);
// syncwarp
-EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t);
+EXTERN void __kmpc_syncwarp(uint64_t);
// tasks
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid,
More information about the Openmp-commits mailing list