[libc-commits] [libc] [libc][libm][GPU] Add support for fast vendor math (PR #66439)

Thu Sep 14 14:55:21 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-libc
            
<details>
<summary>Changes</summary>
I have been experimenting with how we could potentially support the fast math versions of some of the vendor functions. I hope some of you can suggest a more elegant way of doing it, as this is, perhaps, a bit hacky. Now, if users compile a program with `-ffast-math` when linking in `libmgpu`, the header will make sure that the appropriate function, `__nv_fast_cosf` or `__nv_cosf`, is selected.
<br>
Is this a bad idea? Does the extra namespace cause problems? And do we need fast math support like this, or should we only use fast math versions if `libm` is compiled with fast math enabled?

## Extra entrypoint
By adding a placeholder entry point, there are now two versions of `cosf` in `libmgpu.a`, namely `internal::cosf` and `internal::fast::cosf`.
```bash
$ llvm-ar x install/lib/x86_64-unknown-linux-gnu/libmgpu.a fast_cosf.cpp.o
$ clang-offload-packager fast_cosf.cpp.o --image=file=output.bc,arch=sm_70
$ llvm-dis output.bc && grep output.ll -e _nv_ -C 5
```
```llvm
define linkonce_odr hidden noundef float @_ZN11__llvm_libc8internal4fast4cosfEf(float noundef %x) #0 comdat {
entry:
  %x.addr = alloca float, align 4
  store float %x, ptr %x.addr, align 4
  %0 = load float, ptr %x.addr, align 4
  %call = call float @__nv_fast_cosf(float noundef %0) #2
  ret float %call
}

declare float @__nv_fast_cosf(float noundef) #1
```

```bash
$ llvm-ar x install/lib/x86_64-unknown-linux-gnu/libmgpu.a cosf.cpp.o
$ /dev/shm/rydahl1/LLVM/install/bin/clang-offload-packager cosf.cpp.o --image=file=output.bc,arch=sm_70
$ llvm-dis output.bc && grep output.ll -e _nv_ -C 5
```
```llvm
@_ZN11__llvm_libc4cosfEf = hidden alias float (float), ptr @cosf

; Function Attrs: mustprogress nounwind
define hidden noundef float @cosf(float noundef %x) #0 {
entry:
  %call.i = tail call float @__nv_cosf(float noundef %x) #2
  ret float %call.i
}

declare float @__nv_cosf(float noundef) local_unnamed_addr #1
```

## Test Program
When compiling a nice, little test program
```C++
#define ABS(a) (a > 0) ? (a) : -(a)
#include "libc/src/math/cosf.h"
using namespace __llvm_libc::cosf;

int main(void){
  float result = -10.0;
# pragma omp target map(always,from:result)
  {
    result = cosf(-0.12345);
  }
  return ((result >= -1) && (result <= 1)) ;
}
```
it also looked like the correct versions of `cosf` were linked in:
```bash
$ gpu=sm_70
$ clang++ -std=c++2a -O2 -D__CLANG_GPU_APPROX_TRANSCENDENTALS__ -Wall -nogpuinc -fopenmp --offload-arch=$gpu -fopenmp-offload-mandatory -foffload-lto cosf.cpp -o fastmath -Linstall/lib/x86_64-unknown-linux-gnu/ -lmgpu -Linstall/lib -lomp -Linstall/lib -lomptarget
```
```llvm
; Function Attrs: mustprogress noinline nounwind optnone
define internal noundef float @_ZN11__llvm_libc8internal4fast4cosfEf(float noundef %0) #0 comdat {
  %2 = alloca float, align 4
  store float %0, ptr %2, align 4
  %3 = load float, ptr %2, align 4
  %4 = call float @__nv_fast_cosf(float noundef %3) #49
  ret float %4
}
```
```bash
$ gpu=sm_70
$ clang++ -std=c++2a -O2 -Wall -nogpuinc -fopenmp --offload-arch=$gpu -fopenmp-offload-mandatory -foffload-lto cosf.cpp -o fastmath -Linstall/lib/x86_64-unknown-linux-gnu/ -lmgpu -Linstall/lib -lomp -Linstall/lib -lomptarget
```
```llvm
@_ZN11__llvm_libc4cosfEf = internal alias float (float), ptr @cosf

; Function Attrs: mustprogress nounwind
define hidden noundef float @cosf(float noundef %0) #0 {
  %2 = tail call float @__nv_cosf(float noundef %0) #49
  ret float %2
}
```
I could, of course, have compiled the programs at a higher optimization level than `-O2` to get rid of some of the redundancy here, but then it is harder to see what is linked in, because the function calls are replaced by `nvvm` instructions.
--
Full diff: https://github.com/llvm/llvm-project/pull/66439.diff

9 Files Affected:

- (modified) libc/config/gpu/entrypoints.txt (+4) 
- (modified) libc/src/math/CMakeLists.txt (+2) 
- (modified) libc/src/math/cosf.h (+7-1) 
- (modified) libc/src/math/gpu/vendor/CMakeLists.txt (+11) 
- (modified) libc/src/math/gpu/vendor/amdgpu/amdgpu.h (+3) 
- (modified) libc/src/math/gpu/vendor/amdgpu/declarations.h (+1) 
- (modified) libc/src/math/gpu/vendor/cosf.cpp (+6) 
- (modified) libc/src/math/gpu/vendor/nvptx/declarations.h (+1) 
- (modified) libc/src/math/gpu/vendor/nvptx/nvptx.h (+3) 


<pre>

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index ba3e41ce3e5a8ca..149f218cc74ece6 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -198,6 +198,10 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.truncf
 )
 
+if(${LIBC_GPU_VENDOR_MATH})
+  list(APPEND TARGET_LIBM_ENTRYPOINTS libc.src.math.fast_cosf)
+endif()
+
 set(TARGET_LLVMLIBC_ENTRYPOINTS
   ${TARGET_LIBC_ENTRYPOINTS}
   ${TARGET_LIBM_ENTRYPOINTS}
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 8b2021cac8239fe..1edbbf7bebae201 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -102,6 +102,8 @@ add_math_entrypoint_object(floor)
 add_math_entrypoint_object(floorf)
 add_math_entrypoint_object(floorl)
 
+add_math_entrypoint_object(fast_cosf)
+
 add_math_entrypoint_object(fma)
 add_math_entrypoint_object(fmaf)
 
diff --git a/libc/src/math/cosf.h b/libc/src/math/cosf.h
index 1aaabe900ba884f..8f7a204c3c5f9ec 100644
--- a/libc/src/math/cosf.h
+++ b/libc/src/math/cosf.h
@@ -10,8 +10,14 @@
 #define LLVM_LIBC_SRC_MATH_COSF_H
 
 namespace __llvm_libc {
-
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
+namespace fast {
+    float cosf(float x);
+}
+using fast::cosf;
+#else
 float cosf(float x);
+#endif
 
 } // namespace __llvm_libc
 
diff --git a/libc/src/math/gpu/vendor/CMakeLists.txt b/libc/src/math/gpu/vendor/CMakeLists.txt
index 2ee74a06a02d461..24e90de5f94306b 100644
--- a/libc/src/math/gpu/vendor/CMakeLists.txt
+++ b/libc/src/math/gpu/vendor/CMakeLists.txt
@@ -117,6 +117,17 @@ add_entrypoint_object(
     -O2
 )
 
+add_entrypoint_object(
+  fast_cosf
+  SRCS
+    cosf.cpp
+  HDRS
+    ../../cosf.h
+  COMPILE_OPTIONS
+    ${bitcode_link_flags}
+    -D__CLANG_GPU_APPROX_TRANSCENDENTALS__ -O3 -ffast-math
+)
+
 add_entrypoint_object(
   cosh
   SRCS
diff --git a/libc/src/math/gpu/vendor/amdgpu/amdgpu.h b/libc/src/math/gpu/vendor/amdgpu/amdgpu.h
index 7755174e445b222..7dc58795c5076b0 100644
--- a/libc/src/math/gpu/vendor/amdgpu/amdgpu.h
+++ b/libc/src/math/gpu/vendor/amdgpu/amdgpu.h
@@ -30,6 +30,9 @@ LIBC_INLINE float expf(float x) { return __builtin_expf(x); }
 LIBC_INLINE float exp2f(float x) { return __builtin_exp2f(x); }
 LIBC_INLINE float exp10f(float x) { return __ocml_exp10_f32(x); }
 LIBC_INLINE float expm1f(float x) { return __ocml_expm1_f32(x); }
+namespace fast {
+  LIBC_INLINE float cosf(float x) { return __ocml_native_cos_f32(x); }
+}
 LIBC_INLINE double fdim(double x, double y) { return __ocml_fdim_f64(x, y); }
 LIBC_INLINE float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); }
 LIBC_INLINE double hypot(double x, double y) { return __ocml_hypot_f64(x, y); }
diff --git a/libc/src/math/gpu/vendor/amdgpu/declarations.h b/libc/src/math/gpu/vendor/amdgpu/declarations.h
index 7219d5a7dfa6d73..a5b55403e276700 100644
--- a/libc/src/math/gpu/vendor/amdgpu/declarations.h
+++ b/libc/src/math/gpu/vendor/amdgpu/declarations.h
@@ -36,6 +36,7 @@ int __ocml_ilogb_f64(double);
 int __ocml_ilogb_f32(float);
 float __ocml_ldexp_f32(float, int);
 double __ocml_ldexp_f64(double, int);
+float __ocml_native_cos_f32(float);
 float __ocml_nextafter_f32(float, float);
 double __ocml_nextafter_f64(double, double);
 float __ocml_pow_f32(float, float);
diff --git a/libc/src/math/gpu/vendor/cosf.cpp b/libc/src/math/gpu/vendor/cosf.cpp
index 3ce8fa6361d6eb8..50aac9b71fd8c3a 100644
--- a/libc/src/math/gpu/vendor/cosf.cpp
+++ b/libc/src/math/gpu/vendor/cosf.cpp
@@ -13,6 +13,12 @@
 
 namespace __llvm_libc {
 
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
+namespace fast {
+  LLVM_LIBC_FUNCTION(float, cosf, (float x)) { return __llvm_libc::internal::fast::cosf(x); }
+}
+#else
 LLVM_LIBC_FUNCTION(float, cosf, (float x)) { return internal::cosf(x); }
+#endif
 
 } // namespace __llvm_libc
diff --git a/libc/src/math/gpu/vendor/nvptx/declarations.h b/libc/src/math/gpu/vendor/nvptx/declarations.h
index 8b6702834a04cf4..0c1b8fda8cd50f7 100644
--- a/libc/src/math/gpu/vendor/nvptx/declarations.h
+++ b/libc/src/math/gpu/vendor/nvptx/declarations.h
@@ -26,6 +26,7 @@ float __nv_expf(float);
 float __nv_exp2f(float);
 float __nv_exp10f(float);
 float __nv_expm1f(float);
+float __nv_fast_cosf(float);
 double __nv_fdim(double, double);
 float __nv_fdimf(float, float);
 double __nv_hypot(double, double);
diff --git a/libc/src/math/gpu/vendor/nvptx/nvptx.h b/libc/src/math/gpu/vendor/nvptx/nvptx.h
index 6ea1743cf7a6f3e..5eee7a11a4a79d7 100644
--- a/libc/src/math/gpu/vendor/nvptx/nvptx.h
+++ b/libc/src/math/gpu/vendor/nvptx/nvptx.h
@@ -29,6 +29,9 @@ LIBC_INLINE float expf(float x) { return __nv_expf(x); }
 LIBC_INLINE float exp2f(float x) { return __nv_exp2f(x); }
 LIBC_INLINE float exp10f(float x) { return __nv_exp10f(x); }
 LIBC_INLINE float expm1f(float x) { return __nv_expm1f(x); }
+namespace fast {
+  LIBC_INLINE float cosf(float x) { return __nv_fast_cosf(x); }
+}
 LIBC_INLINE double fdim(double x, double y) { return __nv_fdim(x, y); }
 LIBC_INLINE float fdimf(float x, float y) { return __nv_fdimf(x, y); }
 LIBC_INLINE double hypot(double x, double y) { return __nv_hypot(x, y); }
</pre>
</details>


https://github.com/llvm/llvm-project/pull/66439