[Openmp-commits] [openmp] c2a289d - [openmp] Provide an assembly implementation of __kmp_invoke_microtask on ARM
Martin Storsjö via Openmp-commits
openmp-commits at lists.llvm.org
Thu Dec 8 12:53:58 PST 2022
Author: Martin Storsjö
Date: 2022-12-08T22:53:28+02:00
New Revision: c2a289dbdf3b8aac2e7f824b00cd54eb632e620a
URL: https://github.com/llvm/llvm-project/commit/c2a289dbdf3b8aac2e7f824b00cd54eb632e620a
DIFF: https://github.com/llvm/llvm-project/commit/c2a289dbdf3b8aac2e7f824b00cd54eb632e620a.diff
LOG: [openmp] Provide an assembly implementation of __kmp_invoke_microtask on ARM
This fixes passing an arbitrarily large number of arguments to
microtasks, fixing the misc_bugs/many-microtask-args.c testcase on
ARM.
Differential Revision: https://reviews.llvm.org/D138704
Added:
Modified:
openmp/runtime/src/CMakeLists.txt
openmp/runtime/src/z_Linux_asm.S
openmp/runtime/src/z_Linux_util.cpp
openmp/runtime/src/z_Windows_NT-586_util.cpp
openmp/runtime/test/misc_bugs/many-microtask-args.c
Removed:
################################################################################
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index f222129ab2ec9..09a05abdf7bf9 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -95,8 +95,8 @@ else()
libomp_append(LIBOMP_CXXFILES z_Windows_NT-586_util.cpp)
if(${LIBOMP_ARCH} STREQUAL "i386" OR ${LIBOMP_ARCH} STREQUAL "x86_64")
libomp_append(LIBOMP_ASMFILES z_Windows_NT-586_asm.asm) # Windows assembly file
- elseif(${LIBOMP_ARCH} STREQUAL "aarch64" AND (NOT MSVC OR CMAKE_C_COMPILER_ID STREQUAL "Clang"))
- # z_Linux_asm.S works for AArch64 Windows too.
+ elseif((${LIBOMP_ARCH} STREQUAL "aarch64" OR ${LIBOMP_ARCH} STREQUAL "arm") AND (NOT MSVC OR CMAKE_C_COMPILER_ID STREQUAL "Clang"))
+ # z_Linux_asm.S works for AArch64 and ARM Windows too.
libomp_append(LIBOMP_GNUASMFILES z_Linux_asm.S)
else()
# AArch64 with MSVC gets implementations of the functions from
diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index bd008fa20ef9d..557f3784b3c48 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -108,7 +108,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_x86_64
-#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
# if KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
@@ -160,7 +160,11 @@ KMP_PREFIX_UNDERSCORE(\proc):
.cfi_endproc
// Not sure why we need .type and .size for the functions
ALIGN 2
+#if KMP_ARCH_ARM
+ .type \proc,%function
+#else
.type \proc,@function
+#endif
.size \proc,.-\proc
.endm
@@ -172,7 +176,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
.endm
# endif // KMP_OS_DARWIN
-#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
+#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
.macro COMMON name, size, align_power
#if KMP_OS_DARWIN
@@ -1358,6 +1362,148 @@ KMP_LABEL(kmp_1):
#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
+
+//------------------------------------------------------------------------
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
+// int gtid, int tid,
+// int argc, void *p_argv[]
+// #if OMPT_SUPPORT
+// ,
+// void **exit_frame_ptr
+// #endif
+// ) {
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+// (*pkfn)( & gtid, & tid, argv[0], ... );
+//
+// // FIXME: This is done at call-site and can be removed here.
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = 0;
+// #endif
+//
+// return 1;
+// }
+//
+// parameters:
+// r0: pkfn
+// r1: gtid
+// r2: tid
+// r3: argc
+// r4(stack): p_argv
+// r5(stack): &exit_frame
+//
+// locals:
+// __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
+// __tid: tid parm pushed on stack so can pass &tid to pkfn
+//
+// reg temps:
+// r4: used to hold pkfn address
+// r5: used as temporary for number of pkfn parms
+// r6: used to traverse p_argv array
+// r7: frame pointer (in some configurations)
+// r8: used as temporary for stack placement calculation
+// and as pointer to base of callee saved area
+// r9: used as temporary for stack parameters
+// r10: used to preserve exit_frame_ptr, callee-save
+// r11: frame pointer (in some configurations)
+//
+// return: r0 (always 1/TRUE)
+//
+
+__gtid = 4
+__tid = 8
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+ .text
+ PROC __kmp_invoke_microtask
+
+ // Pushing one extra register (r3) to keep the stack aligned
+ // for when we call pkfn below
+ push {r3-r11,lr}
+ // Load p_argv and &exit_frame
+ ldrd r4, r5, [sp, #10*4]
+
+# if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS)
+# define FP r7
+# define FPOFF 4*4
+#else
+# define FP r11
+# define FPOFF 8*4
+#endif
+ add FP, sp, #FPOFF
+# if OMPT_SUPPORT
+ mov r10, r5
+ str FP, [r10]
+# endif
+ mov r8, sp
+
+ // Calculate how much stack to allocate, in increments of 8 bytes.
+ // We strictly need 4*(argc-2) bytes (2 arguments are passed in
+ // registers) but allocate 4*argc for simplicity (to avoid needing
+ // to handle the argc<2 cases). We align the number of bytes
+ // allocated to 8 bytes, to keep the stack aligned. (Since we
+ // already allocate more than enough, it's ok to round down
+ // instead of up for the alignment.) We allocate another extra
+ // 8 bytes for gtid and tid.
+ mov r5, #1
+ add r5, r5, r3, lsr #1
+ sub sp, sp, r5, lsl #3
+
+ str r1, [r8, #-__gtid]
+ str r2, [r8, #-__tid]
+ mov r5, r3
+ mov r6, r4
+ mov r4, r0
+
+ // Prepare the first 2 parameters to pkfn - pointers to gtid and tid
+ // in our stack frame.
+ sub r0, r8, #__gtid
+ sub r1, r8, #__tid
+
+ mov r8, sp
+
+ // Load p_argv[0] and p_argv[1] into r2 and r3, if argc >= 1/2
+ cmp r5, #0
+ beq KMP_LABEL(kmp_1)
+ ldr r2, [r6]
+
+ subs r5, r5, #1
+ beq KMP_LABEL(kmp_1)
+ ldr r3, [r6, #4]!
+
+ // Loop, loading the rest of p_argv and writing the elements on the
+ // stack.
+KMP_LABEL(kmp_0):
+ subs r5, r5, #1
+ beq KMP_LABEL(kmp_1)
+ ldr r12, [r6, #4]!
+ str r12, [r8], #4
+ b KMP_LABEL(kmp_0)
+KMP_LABEL(kmp_1):
+ blx r4
+ mov r0, #1
+
+ sub r4, FP, #FPOFF
+ mov sp, r4
+# undef FP
+# undef FPOFF
+
+# if OMPT_SUPPORT
+ mov r1, #0
+ str r1, [r10]
+# endif
+ pop {r3-r11,pc}
+
+ DEBUG_INFO __kmp_invoke_microtask
+// -- End __kmp_invoke_microtask
+
+#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
+
#if KMP_ARCH_PPC64
//------------------------------------------------------------------------
@@ -1919,7 +2065,9 @@ __kmp_invoke_microtask:
.global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
.4byte .gomp_critical_user_
+#ifdef __ELF__
.size __kmp_unnamed_critical_addr,4
+#endif
#endif /* KMP_ARCH_ARM */
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index b1525a3687ad8..95c9e97b37fa7 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -2448,7 +2448,8 @@ int __kmp_get_load_balance(int max) {
#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \
((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \
- KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64)
+ KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_ARM)
// we really only need the case with 1 argument, because CLANG always build
// a struct of pointers to shared variables referenced in the outlined function
diff --git a/openmp/runtime/src/z_Windows_NT-586_util.cpp b/openmp/runtime/src/z_Windows_NT-586_util.cpp
index c06e5aaa205dc..37759feafd453 100644
--- a/openmp/runtime/src/z_Windows_NT-586_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT-586_util.cpp
@@ -189,95 +189,4 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
}
#endif
-#if KMP_ARCH_ARM
-// This matches the generic fallback implementation of __kmp_invoke_microtask
-// from z_Linux_util.cpp, which is used on Linux on ARM.
-int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
- void *p_argv[]
-#if OMPT_SUPPORT
- ,
- void **exit_frame_ptr
-#endif
-) {
-#if OMPT_SUPPORT
- *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
-#endif
-
- switch (argc) {
- default:
- fprintf(stderr, "Too many args to microtask: %d!\n", argc);
- fflush(stderr);
- exit(-1);
- case 0:
- (*pkfn)(&gtid, &tid);
- break;
- case 1:
- (*pkfn)(&gtid, &tid, p_argv[0]);
- break;
- case 2:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
- break;
- case 3:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
- break;
- case 4:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
- break;
- case 5:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
- break;
- case 6:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5]);
- break;
- case 7:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6]);
- break;
- case 8:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7]);
- break;
- case 9:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8]);
- break;
- case 10:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]);
- break;
- case 11:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
- break;
- case 12:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11]);
- break;
- case 13:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12]);
- break;
- case 14:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12], p_argv[13]);
- break;
- case 15:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
- break;
- }
-
-#if OMPT_SUPPORT
- *exit_frame_ptr = 0;
-#endif
-
- return 1;
-}
-#endif
-
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 || KMP_ARCH_ARM */
diff --git a/openmp/runtime/test/misc_bugs/many-microtask-args.c b/openmp/runtime/test/misc_bugs/many-microtask-args.c
index 1ede11038eed3..d644515d9a4f2 100644
--- a/openmp/runtime/test/misc_bugs/many-microtask-args.c
+++ b/openmp/runtime/test/misc_bugs/many-microtask-args.c
@@ -1,11 +1,6 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
-// This test fails with Clang unless __kmp_invoke_microtask supports at least
-// 17 arguments. On ARM, the fallback C implementation of __kmp_invoke_microtask
-// is used, and that one only currently supports up to 15 arguments.
-// XFAIL: arm
-
int main()
{
More information about the Openmp-commits
mailing list