[libc-commits] [libc] [libc] Implement basic 'fenv.h' utilities on the AMD GPU (PR #83500)

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Tue May 7 11:23:42 PDT 2024


================
@@ -0,0 +1,229 @@
+//===-- amdgpu floating point env manipulation functions --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
+#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/properties/architectures.h"
+
+#if !defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#error "Invalid include"
+#endif
+
+#include "hdr/types/fenv_t.h"
+#include "hdr/fenv_macros.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace fputil {
+
+namespace internal {
+// Returns the current status of the AMDGPU floating point environment. In
+// practice this is simply a 64-bit concatenation of the mode register and the
+// trap status register.
+//
+// The mode register controls the floating point behaviour of the device. It
+// can be read or written to by the kernel during runtime. It is laid out as a
+// bit field with the following offsets and sizes listed for the relevant
+// entries.
+//
+// ┌─────┬─────────────┬─────┬─────────┬──────────┬─────────────┬────────────┐
+// │ ... │ EXCP[20:12] │ ... │ IEEE[9] │ CLAMP[8] │ DENORM[7:4] │ ROUND[3:0] │
+// └─────┴─────────────┴─────┴─────────┴──────────┴─────────────┴────────────┘
+//
+// The rounding mode and denormal modes both control f64/f16 and f32 precision
+// operations separately with two bits. The accepted values for the rounding
+// mode are nearest, upward, downward, and toward zero, given 0, 1, 2, and 3
+// respectively.
+//
+// The CLAMP bit indicates that DirectX 10 handling of NaNs is enabled in the
+// vector ALU. When set this will clamp NaN values to zero and pass them
+// otherwise. A hardware bug causes this bit to prevent floating exceptions
+// from being recorded if this bit is set on all generations before GFX12.
+//
+// The IEEE bit controls whether or not floating point operations supporting
+// exception gathering are IEEE 754-2008 compliant.
+//
+// The EXCP field indicates which exceptions will cause the instruction to
+// take a trap if traps are enabled, see the status register. The bit layout
+// is identical to that in the trap status register. We are only concerned
+// with the first six bits and ignore the other three.
+//
+// The trap status register contains information about the status of the
+// exceptions. These bits are accumulated regardless of trap handling status
+// and are sticky until cleared.
+//
+// 5         4           3          2                1          0
+// ┌─────────┬───────────┬──────────┬────────────────┬──────────┬─────────┐
+// │ Inexact │ Underflow │ Overflow │ Divide by zero │ Denormal │ Invalid │
+// └─────────┴───────────┴──────────┴────────────────┴──────────┴─────────┘
+//
+// These exceptions indicate that at least one lane in the current wavefront
+// signalled a floating point exception. There is no way to increase the
+// granularity.
+//
+// The returned value has the following layout.
+//
+// ┌────────────────────┬─────────────────────┐
+// │ Trap Status[38:32] │ Mode Register[31:0] │
+// └────────────────────┴─────────────────────┘
+LIBC_INLINE uint64_t get_fpenv() { return __builtin_amdgcn_get_fpenv(); }
+
+// Set the floating point environment using the same layout as above.
+LIBC_INLINE void set_fpenv(uint64_t env) { __builtin_amdgcn_set_fpenv(env); }
+
+// The six bits used to encode the standard floating point exceptions in the
+// trap status register.
+enum ExceptionFlags : uint32_t {
+  EXCP_INVALID_F = 0x1,
+  EXCP_DENORMAL_F = 0x2,
+  EXCP_DIV_BY_ZERO_F = 0x4,
+  EXCP_OVERFLOW_F = 0x8,
+  EXCP_UNDERFLOW_F = 0x10,
+  EXCP_INEXACT_F = 0x20,
+};
+
+// The values used by the AMDGPU backend to handle the `llvm.get.rounding`
+// intrinsic function. See the values in the documentation for more information.
+// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-rounding-mode-enumeration-values-table
+enum RoundingFlags : uint32_t {
+  ROUND_TOWARD_ZERO = 0x0,
+  ROUND_TO_NEAREST = 0x1,
+  ROUND_UPWARD = 0x2,
+  ROUND_DOWNWARD = 0x3,
+};
+
+// Exception flags are individual bits in the corresponding hardware register.
+// This converts between the exported C standard values and the hardware values.
+LIBC_INLINE uint32_t get_status_value_for_except(uint32_t excepts) {
+  return (excepts & FE_INVALID ? EXCP_INVALID_F : 0) |
+#ifdef __FE_DENORM
----------------
jhuber6 wrote:

My guess is that the original implementation thought it would be easier to make it fixed in the header than to have target specific values, but we could probably go that direction. It would be a larger rewrite though so it might be out of scope here.

https://github.com/llvm/llvm-project/pull/83500


More information about the libc-commits mailing list