[Openmp-commits] [openmp] c2a289d - [openmp] Provide an assembly implementation of __kmp_invoke_microtask on ARM

Martin Storsjö via Openmp-commits openmp-commits at lists.llvm.org
Thu Dec 8 12:53:58 PST 2022


Author: Martin Storsjö
Date: 2022-12-08T22:53:28+02:00
New Revision: c2a289dbdf3b8aac2e7f824b00cd54eb632e620a

URL: https://github.com/llvm/llvm-project/commit/c2a289dbdf3b8aac2e7f824b00cd54eb632e620a
DIFF: https://github.com/llvm/llvm-project/commit/c2a289dbdf3b8aac2e7f824b00cd54eb632e620a.diff

LOG: [openmp] Provide an assembly implementation of __kmp_invoke_microtask on ARM

This fixes passing an arbitrarily large number of arguments to
microtasks, fixing the misc_bugs/many-microtask-args.c testcase on
ARM.

Differential Revision: https://reviews.llvm.org/D138704

Added: 
    

Modified: 
    openmp/runtime/src/CMakeLists.txt
    openmp/runtime/src/z_Linux_asm.S
    openmp/runtime/src/z_Linux_util.cpp
    openmp/runtime/src/z_Windows_NT-586_util.cpp
    openmp/runtime/test/misc_bugs/many-microtask-args.c

Removed: 
    


################################################################################
diff  --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index f222129ab2ec9..09a05abdf7bf9 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -95,8 +95,8 @@ else()
     libomp_append(LIBOMP_CXXFILES z_Windows_NT-586_util.cpp)
     if(${LIBOMP_ARCH} STREQUAL "i386" OR ${LIBOMP_ARCH} STREQUAL "x86_64")
       libomp_append(LIBOMP_ASMFILES z_Windows_NT-586_asm.asm) # Windows assembly file
-    elseif(${LIBOMP_ARCH} STREQUAL "aarch64" AND (NOT MSVC OR CMAKE_C_COMPILER_ID STREQUAL "Clang"))
-      # z_Linux_asm.S works for AArch64 Windows too.
+    elseif((${LIBOMP_ARCH} STREQUAL "aarch64" OR ${LIBOMP_ARCH} STREQUAL "arm") AND (NOT MSVC OR CMAKE_C_COMPILER_ID STREQUAL "Clang"))
+      # z_Linux_asm.S works for AArch64 and ARM Windows too.
       libomp_append(LIBOMP_GNUASMFILES z_Linux_asm.S)
     else()
       # AArch64 with MSVC gets implementations of the functions from

diff  --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index bd008fa20ef9d..557f3784b3c48 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -108,7 +108,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
 # endif // KMP_OS_DARWIN
 #endif // KMP_ARCH_X86 || KMP_ARCH_x86_64
 
-#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
 
 # if KMP_OS_DARWIN
 #  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
@@ -160,7 +160,11 @@ KMP_PREFIX_UNDERSCORE(\proc):
 	.cfi_endproc
 // Not sure why we need .type and .size for the functions
 	ALIGN 2
+#if KMP_ARCH_ARM
+	.type  \proc,%function
+#else
 	.type  \proc,@function
+#endif
 	.size  \proc,.-\proc
 .endm
 
@@ -172,7 +176,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
 .endm
 # endif // KMP_OS_DARWIN
 
-#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
+#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
 
 .macro COMMON name, size, align_power
 #if KMP_OS_DARWIN
@@ -1358,6 +1362,148 @@ KMP_LABEL(kmp_1):
 
 #endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */
 
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
+
+//------------------------------------------------------------------------
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
+//                         int gtid, int tid,
+//                         int argc, void *p_argv[]
+// #if OMPT_SUPPORT
+//                         ,
+//                         void **exit_frame_ptr
+// #endif
+//                       ) {
+// #if OMPT_SUPPORT
+//   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+//   (*pkfn)( & gtid, & tid, p_argv[0], ... );
+//
+// // FIXME: This is done at call-site and can be removed here.
+// #if OMPT_SUPPORT
+//   *exit_frame_ptr = 0;
+// #endif
+//
+//   return 1;
+// }
+//
+// parameters:
+//	r0:	pkfn
+//	r1:	gtid
+//	r2:	tid
+//	r3:	argc
+//	r4(stack):	p_argv
+//	r5(stack):	&exit_frame
+//
+// locals:
+//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
+//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
+//
+// reg temps:
+//	 r4:	used to hold pkfn address
+//	 r5:	used as temporary for number of pkfn parms
+//	 r6:	used to traverse p_argv array
+//	 r7:	frame pointer (in some configurations)
+//	 r8:	used as temporary for stack placement calculation
+//	 	and as pointer to base of callee saved area
+//	r12:	used as temporary for stack parameters
+//	r10:	used to preserve exit_frame_ptr, callee-save
+//	r11:	frame pointer (in some configurations)
+//
+// return:	r0	(always 1/TRUE)
+//
+
+__gtid = 4
+__tid = 8
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+	.text
+	PROC __kmp_invoke_microtask
+
+	// Pushing one extra register (r3) to keep the stack aligned
+	// for when we call pkfn below
+	push	{r3-r11,lr}
+	// Load p_argv and &exit_frame
+	ldrd	r4, r5, [sp, #10*4]
+
+# if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS)
+# define FP r7
+# define FPOFF 4*4
+#else
+# define FP r11
+# define FPOFF 8*4
+#endif
+	add	FP, sp, #FPOFF
+# if OMPT_SUPPORT
+	mov	r10, r5
+	str	FP, [r10]
+# endif
+	mov	r8, sp
+
+	// Calculate how much stack to allocate, in increments of 8 bytes.
+	// We strictly need 4*(argc-2) bytes (2 arguments are passed in
+	// registers) but allocate 4*argc for simplicity (to avoid needing
+	// to handle the argc<2 cases). We align the number of bytes
+	// allocated to 8 bytes, to keep the stack aligned. (Since we
+	// already allocate more than enough, it's ok to round down
+	// instead of up for the alignment.) We allocate another extra
+	// 8 bytes for gtid and tid.
+	mov	r5, #1
+	add	r5, r5, r3, lsr #1
+	sub	sp, sp, r5, lsl #3
+
+	str	r1, [r8, #-__gtid]
+	str	r2, [r8, #-__tid]
+	mov	r5, r3
+	mov	r6, r4
+	mov	r4, r0
+
+	// Prepare the first 2 parameters to pkfn - pointers to gtid and tid
+	// in our stack frame.
+	sub	r0, r8, #__gtid
+	sub	r1, r8, #__tid
+
+	mov	r8, sp
+
+	// Load p_argv[0] and p_argv[1] into r2 and r3, if argc >= 1/2
+	cmp	r5, #0
+	beq	KMP_LABEL(kmp_1)
+	ldr	r2, [r6]
+
+	subs	r5, r5, #1
+	beq	KMP_LABEL(kmp_1)
+	ldr	r3, [r6, #4]!
+
+	// Loop, loading the rest of p_argv and writing the elements on the
+	// stack.
+KMP_LABEL(kmp_0):
+	subs	r5, r5, #1
+	beq	KMP_LABEL(kmp_1)
+	ldr	r12, [r6, #4]!
+	str	r12, [r8], #4
+	b	KMP_LABEL(kmp_0)
+KMP_LABEL(kmp_1):
+	blx	r4
+	mov	r0, #1
+
+	sub	r4, FP, #FPOFF
+	mov	sp, r4
+# undef FP
+# undef FPOFF
+
+# if OMPT_SUPPORT
+	mov	r1, #0
+	str	r1, [r10]
+# endif
+	pop	{r3-r11,pc}
+
+	DEBUG_INFO __kmp_invoke_microtask
+// -- End  __kmp_invoke_microtask
+
+#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
+
 #if KMP_ARCH_PPC64
 
 //------------------------------------------------------------------------
@@ -1919,7 +2065,9 @@ __kmp_invoke_microtask:
     .global __kmp_unnamed_critical_addr
 __kmp_unnamed_critical_addr:
     .4byte .gomp_critical_user_
+#ifdef __ELF__
     .size __kmp_unnamed_critical_addr,4
+#endif
 #endif /* KMP_ARCH_ARM */
 
 #if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64

diff  --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index b1525a3687ad8..95c9e97b37fa7 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -2448,7 +2448,8 @@ int __kmp_get_load_balance(int max) {
 
 #if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC ||                            \
       ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) ||                 \
-      KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64)
+      KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||            \
+      KMP_ARCH_ARM)
 
 // we really only need the case with 1 argument, because CLANG always build
 // a struct of pointers to shared variables referenced in the outlined function

diff  --git a/openmp/runtime/src/z_Windows_NT-586_util.cpp b/openmp/runtime/src/z_Windows_NT-586_util.cpp
index c06e5aaa205dc..37759feafd453 100644
--- a/openmp/runtime/src/z_Windows_NT-586_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT-586_util.cpp
@@ -189,95 +189,4 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
 }
 #endif
 
-#if KMP_ARCH_ARM
-// This matches the generic fallback implementation of __kmp_invoke_microtask
-// from z_Linux_util.cpp, which is used on Linux on ARM.
-int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
-                           void *p_argv[]
-#if OMPT_SUPPORT
-                           ,
-                           void **exit_frame_ptr
-#endif
-) {
-#if OMPT_SUPPORT
-  *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
-#endif
-
-  switch (argc) {
-  default:
-    fprintf(stderr, "Too many args to microtask: %d!\n", argc);
-    fflush(stderr);
-    exit(-1);
-  case 0:
-    (*pkfn)(&gtid, &tid);
-    break;
-  case 1:
-    (*pkfn)(&gtid, &tid, p_argv[0]);
-    break;
-  case 2:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
-    break;
-  case 3:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
-    break;
-  case 4:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
-    break;
-  case 5:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
-    break;
-  case 6:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5]);
-    break;
-  case 7:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6]);
-    break;
-  case 8:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7]);
-    break;
-  case 9:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8]);
-    break;
-  case 10:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]);
-    break;
-  case 11:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
-    break;
-  case 12:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11]);
-    break;
-  case 13:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12]);
-    break;
-  case 14:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12], p_argv[13]);
-    break;
-  case 15:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
-    break;
-  }
-
-#if OMPT_SUPPORT
-  *exit_frame_ptr = 0;
-#endif
-
-  return 1;
-}
-#endif
-
 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 || KMP_ARCH_ARM */

diff  --git a/openmp/runtime/test/misc_bugs/many-microtask-args.c b/openmp/runtime/test/misc_bugs/many-microtask-args.c
index 1ede11038eed3..d644515d9a4f2 100644
--- a/openmp/runtime/test/misc_bugs/many-microtask-args.c
+++ b/openmp/runtime/test/misc_bugs/many-microtask-args.c
@@ -1,11 +1,6 @@
 // RUN: %libomp-compile-and-run
 #include <stdio.h>
 
-// This test fails with Clang unless __kmp_invoke_microtask supports at least
-// 17 arguments. On ARM, the fallback C implementation of __kmp_invoke_microtask
-// is used, and that one only currently supports up to 15 arguments.
-// XFAIL: arm
-
 int main()
 {
 


        


More information about the Openmp-commits mailing list