[llvm] 6109e70 - [llvm][APFloat] Add NaN-in-negative-zero formats by AMD and GraphCore

Krzysztof Drewniak via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 9 14:08:07 PST 2023


Author: Krzysztof Drewniak
Date: 2023-02-09T22:08:00Z
New Revision: 6109e70c72fc5171d25c4467fc3cfe6eb2029f50

URL: https://github.com/llvm/llvm-project/commit/6109e70c72fc5171d25c4467fc3cfe6eb2029f50
DIFF: https://github.com/llvm/llvm-project/commit/6109e70c72fc5171d25c4467fc3cfe6eb2029f50.diff

LOG: [llvm][APFloat] Add NaN-in-negative-zero formats by AMD and GraphCore

AMD, GraphCore, and Qualcomm have published a standard for 8-bit floats that
differs from the 8-bit floats defined by Nvidia, Intel, and ARM. This
commit adds support for these alternate 8-bit floats to APFloat in
order to enable their usage in MLIR. These formats are presented in
the paper at https://arxiv.org/abs/2206.02915 and are implemented in
GraphCore hardware whose ISA is available at
https://docs.graphcore.ai/projects/isa-mk2-with-fp8/en/latest/_static/TileVertexISA-IPU21-1.3.1.pdf .

In these formats, like the existing Float8E4M3FN, there are no
infinity values and there is only one NaN. Unlike in that format,
however, the NaN value is 0x80, which would be negative 0 in IEEE
formats. This means that these formats also make 0 unsigned.

To allow for these new variant semantics, this commit adds
fltNanEncoding, which can be IEEE (the default), AllOnes (used by
Float8E4M3FN), or NegativeZero (used by the new formats,
Float8E5M2FNUZ and Float8E4M3FNUZ). Normalization, arithmetic, and
other such routines have been updated to account for the potential
variant semantics.

The two new formats are Float8E5M2FNUZ (5 bits exponent, 2 bits
mantissa, finite, unsigned zero) and Float8E4M3FNUZ (4 bits exponent,
3 bits mantissa, finite, unsigned zero).

Reviewed By: jakeh-gc, reedwm, lattner

Differential Revision: https://reviews.llvm.org/D141863

Added: 
    

Modified: 
    llvm/include/llvm/ADT/APFloat.h
    llvm/lib/Support/APFloat.cpp
    llvm/unittests/ADT/APFloatTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index cad6ef8caeb97..402ffba3ff8e4 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -158,11 +158,26 @@ struct APFloatBase {
     // 8-bit floating point number following IEEE-754 conventions with bit
     // layout S1E5M2 as described in https://arxiv.org/abs/2209.05433.
     S_Float8E5M2,
+    // 8-bit floating point number mostly following IEEE-754 conventions
+    // and bit layout S1E5M2 described in https://arxiv.org/abs/2206.02915,
+    // with expanded range and with no infinity or signed zero.
+    // NaN is represented as negative zero. (FN -> Finite, UZ -> unsigned zero).
+    // This format's exponent bias is 16, instead of the 15 (2 ** (5 - 1) - 1)
+    //  that IEEE precedent would imply.
+    S_Float8E5M2FNUZ,
     // 8-bit floating point number mostly following IEEE-754 conventions with
     // bit layout S1E4M3 as described in https://arxiv.org/abs/2209.05433.
     // Unlike IEEE-754 types, there are no infinity values, and NaN is
     // represented with the exponent and mantissa bits set to all 1s.
     S_Float8E4M3FN,
+    // 8-bit floating point number mostly following IEEE-754 conventions
+    // and bit layout S1E4M3 described in https://arxiv.org/abs/2206.02915,
+    // with expanded range and with no infinity or signed zero.
+    // NaN is represented as negative zero. (FN -> Finite, UZ -> unsigned zero).
+    // This format's exponent bias is 8, instead of the 7 (2 ** (4 - 1) - 1)
+    // that IEEE precedent would imply.
+    S_Float8E4M3FNUZ,
+
     S_x87DoubleExtended,
     S_MaxSemantics = S_x87DoubleExtended,
   };
@@ -177,7 +192,9 @@ struct APFloatBase {
   static const fltSemantics &IEEEquad() LLVM_READNONE;
   static const fltSemantics &PPCDoubleDouble() LLVM_READNONE;
   static const fltSemantics &Float8E5M2() LLVM_READNONE;
+  static const fltSemantics &Float8E5M2FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3FN() LLVM_READNONE;
+  static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -570,7 +587,9 @@ class IEEEFloat final : public APFloatBase {
   APInt convertF80LongDoubleAPFloatToAPInt() const;
   APInt convertPPCDoubleDoubleAPFloatToAPInt() const;
   APInt convertFloat8E5M2APFloatToAPInt() const;
+  APInt convertFloat8E5M2FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3FNAPFloatToAPInt() const;
+  APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   void initFromHalfAPInt(const APInt &api);
   void initFromBFloatAPInt(const APInt &api);
@@ -580,7 +599,9 @@ class IEEEFloat final : public APFloatBase {
   void initFromF80LongDoubleAPInt(const APInt &api);
   void initFromPPCDoubleDoubleAPInt(const APInt &api);
   void initFromFloat8E5M2APInt(const APInt &api);
+  void initFromFloat8E5M2FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3FNAPInt(const APInt &api);
+  void initFromFloat8E4M3FNUZAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);

diff  --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index e9998d71b82a1..2e7926b4aa7ac 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -14,6 +14,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringExtras.h"
@@ -51,49 +52,75 @@ static_assert(APFloatBase::integerPartWidth % 4 == 0, "Part width must be divisi
 
 namespace llvm {
 
-  // How the nonfinite values Inf and NaN are represented.
-  enum class fltNonfiniteBehavior {
-    // Represents standard IEEE 754 behavior. A value is nonfinite if the
-    // exponent field is all 1s. In such cases, a value is Inf if the
-    // significand bits are all zero, and NaN otherwise
-    IEEE754,
-
-    // Only the Float8E5M2 has this behavior. There is no Inf representation. A
-    // value is NaN if the exponent field and the mantissa field are all 1s.
-    // This behavior matches the FP8 E4M3 type described in
-    // https://arxiv.org/abs/2209.05433. We treat both signed and unsigned NaNs
-    // as non-signalling, although the paper does not state whether the NaN
-    // values are signalling or not.
-    NanOnly,
-  };
+// How the nonfinite values Inf and NaN are represented.
+enum class fltNonfiniteBehavior {
+  // Represents standard IEEE 754 behavior. A value is nonfinite if the
+  // exponent field is all 1s. In such cases, a value is Inf if the
+  // significand bits are all zero, and NaN otherwise
+  IEEE754,
+
+  // This behavior is present in the Float8ExMyFN* types (Float8E4M3FN,
+  // Float8E5M2FNUZ, and Float8E4M3FNUZ). There is no representation for Inf,
+  // and operations that would ordinarily produce Inf produce NaN instead.
+  // The details of the NaN representation(s) in this form are determined by the
+  // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
+  // encodings do not distinguish between signalling and quiet NaN.
+  NanOnly,
+};
 
-  /* Represents floating point arithmetic semantics.  */
-  struct fltSemantics {
-    /* The largest E such that 2^E is representable; this matches the
-       definition of IEEE 754.  */
-    APFloatBase::ExponentType maxExponent;
+// How NaN values are represented. This is currently only used in combination
+// with fltNonfiniteBehavior::NanOnly, and using a variant other than IEEE
+// while having IEEE non-finite behavior is liable to lead to unexpected
+// results.
+enum class fltNanEncoding {
+  // Represents the standard IEEE behavior where a value is NaN if its
+  // exponent is all 1s and the significand is non-zero.
+  IEEE,
+
+  // Represents the behavior in the Float8E4M3FN floating point type where NaN is
+  // represented by having the exponent and mantissa set to all 1s.
+  // This behavior matches the FP8 E4M3 type described in
+  // https://arxiv.org/abs/2209.05433. We treat both signed and unsigned NaNs
+  // as non-signalling, although the paper does not state whether the NaN
+  // values are signalling or not.
+  AllOnes,
+
+  // Represents the behavior in Float8E{5,4}M{2,3}FNUZ floating point types
+  // where NaN is represented by a sign bit of 1 and all 0s in the exponent
+  // and mantissa (i.e. the negative zero encoding in an IEEE float). Since
+  // there is only one NaN value, it is treated as quiet NaN. This matches the
+  // behavior described in https://arxiv.org/abs/2206.02915 .
+  NegativeZero,
+};
 
-    /* The smallest E such that 2^E is a normalized number; this
-       matches the definition of IEEE 754.  */
-    APFloatBase::ExponentType minExponent;
+/* Represents floating point arithmetic semantics.  */
+struct fltSemantics {
+  /* The largest E such that 2^E is representable; this matches the
+     definition of IEEE 754.  */
+  APFloatBase::ExponentType maxExponent;
 
-    /* Number of bits in the significand.  This includes the integer
-       bit.  */
-    unsigned int precision;
+  /* The smallest E such that 2^E is a normalized number; this
+     matches the definition of IEEE 754.  */
+  APFloatBase::ExponentType minExponent;
 
-    /* Number of bits actually used in the semantics. */
-    unsigned int sizeInBits;
+  /* Number of bits in the significand.  This includes the integer
+     bit.  */
+  unsigned int precision;
 
-    fltNonfiniteBehavior nonFiniteBehavior = fltNonfiniteBehavior::IEEE754;
+  /* Number of bits actually used in the semantics. */
+  unsigned int sizeInBits;
 
-    // Returns true if any number described by this semantics can be precisely
-    // represented by the specified semantics. Does not take into account
-    // the value of fltNonfiniteBehavior.
-    bool isRepresentableBy(const fltSemantics &S) const {
-      return maxExponent <= S.maxExponent && minExponent >= S.minExponent &&
-             precision <= S.precision;
-    }
-  };
+  fltNonfiniteBehavior nonFiniteBehavior = fltNonfiniteBehavior::IEEE754;
+
+  fltNanEncoding nanEncoding = fltNanEncoding::IEEE;
+  // Returns true if any number described by this semantics can be precisely
+  // represented by the specified semantics. Does not take into account
+  // the value of fltNonfiniteBehavior.
+  bool isRepresentableBy(const fltSemantics &S) const {
+    return maxExponent <= S.maxExponent && minExponent >= S.minExponent &&
+           precision <= S.precision;
+  }
+};
 
   static const fltSemantics semIEEEhalf = {15, -14, 11, 16};
   static const fltSemantics semBFloat = {127, -126, 8, 16};
@@ -101,8 +128,16 @@ namespace llvm {
   static const fltSemantics semIEEEdouble = {1023, -1022, 53, 64};
   static const fltSemantics semIEEEquad = {16383, -16382, 113, 128};
   static const fltSemantics semFloat8E5M2 = {15, -14, 3, 8};
-  static const fltSemantics semFloat8E4M3FN = {8, -6, 4, 8,
-                                               fltNonfiniteBehavior::NanOnly};
+  static const fltSemantics semFloat8E5M2FNUZ = {15,
+                                                 -15,
+                                                 3,
+                                                 8,
+                                                 fltNonfiniteBehavior::NanOnly,
+                                                 fltNanEncoding::NegativeZero};
+  static const fltSemantics semFloat8E4M3FN = {
+      8, -6, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::AllOnes};
+  static const fltSemantics semFloat8E4M3FNUZ = {
+      7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
   static const fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
   static const fltSemantics semBogus = {0, 0, 0, 0};
 
@@ -160,8 +195,12 @@ namespace llvm {
       return PPCDoubleDouble();
     case S_Float8E5M2:
       return Float8E5M2();
+    case S_Float8E5M2FNUZ:
+      return Float8E5M2FNUZ();
     case S_Float8E4M3FN:
       return Float8E4M3FN();
+    case S_Float8E4M3FNUZ:
+      return Float8E4M3FNUZ();
     case S_x87DoubleExtended:
       return x87DoubleExtended();
     }
@@ -184,8 +223,12 @@ namespace llvm {
       return S_PPCDoubleDouble;
     else if (&Sem == &llvm::APFloat::Float8E5M2())
       return S_Float8E5M2;
+    else if (&Sem == &llvm::APFloat::Float8E5M2FNUZ())
+      return S_Float8E5M2FNUZ;
     else if (&Sem == &llvm::APFloat::Float8E4M3FN())
       return S_Float8E4M3FN;
+    else if (&Sem == &llvm::APFloat::Float8E4M3FNUZ())
+      return S_Float8E4M3FNUZ;
     else if (&Sem == &llvm::APFloat::x87DoubleExtended())
       return S_x87DoubleExtended;
     else
@@ -209,7 +252,13 @@ namespace llvm {
     return semPPCDoubleDouble;
   }
   const fltSemantics &APFloatBase::Float8E5M2() { return semFloat8E5M2; }
+  const fltSemantics &APFloatBase::Float8E5M2FNUZ() {
+    return semFloat8E5M2FNUZ;
+  }
   const fltSemantics &APFloatBase::Float8E4M3FN() { return semFloat8E4M3FN; }
+  const fltSemantics &APFloatBase::Float8E4M3FNUZ() {
+    return semFloat8E4M3FNUZ;
+  }
   const fltSemantics &APFloatBase::x87DoubleExtended() {
     return semX87DoubleExtended;
   }
@@ -808,10 +857,15 @@ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) {
 
   APInt fill_storage;
   if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) {
-    // The only NaN representation is where the mantissa is all 1s, which is
-    // non-signalling.
+    // Finite-only types do not distinguish signalling and quiet NaN, so
+    // make them all quiet.
     SNaN = false;
-    fill_storage = APInt::getAllOnes(semantics->precision - 1);
+    if (semantics->nanEncoding == fltNanEncoding::NegativeZero) {
+      sign = true;
+      fill_storage = APInt::getZero(semantics->precision - 1);
+    } else {
+      fill_storage = APInt::getAllOnes(semantics->precision - 1);
+    }
     fill = &fill_storage;
   }
 
@@ -842,6 +896,9 @@ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) {
     // conventionally, this is the next bit down from the QNaN bit.
     if (APInt::tcIsZero(significand, numParts))
       APInt::tcSetBit(significand, QNaNBit - 1);
+  } else if (semantics->nanEncoding == fltNanEncoding::NegativeZero) {
+    // The only NaN is a quiet NaN, and it has no bits set in the significand.
+    // Do nothing.
   } else {
     // We always have to set the QNaN bit to make it a QNaN.
     APInt::tcSetBit(significand, QNaNBit);
@@ -986,7 +1043,8 @@ bool IEEEFloat::isSignificandAllZerosExceptMSB() const {
 }
 
 bool IEEEFloat::isLargest() const {
-  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) {
+  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly &&
+      semantics->nanEncoding == fltNanEncoding::AllOnes) {
     // The largest number by magnitude in our format will be the floating point
     // number with maximum exponent and with significand that is all ones except
     // the LSB.
@@ -1428,7 +1486,8 @@ IEEEFloat::opStatus IEEEFloat::handleOverflow(roundingMode rounding_mode) {
   exponent = semantics->maxExponent;
   tcSetLeastSignificantBits(significandParts(), partCount(),
                             semantics->precision);
-  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
+  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly &&
+      semantics->nanEncoding == fltNanEncoding::AllOnes)
     APInt::tcClearBit(significandParts(), 0);
 
   return opInexact;
@@ -1529,7 +1588,10 @@ IEEEFloat::opStatus IEEEFloat::normalize(roundingMode rounding_mode,
     }
   }
 
+  // The all-ones value is an overflow if NaN is all ones. If NaN is
+  // represented by negative zero, then it is a valid finite value.
   if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly &&
+      semantics->nanEncoding == fltNanEncoding::AllOnes &&
       exponent == semantics->maxExponent && isSignificandAllOnes())
     return handleOverflow(rounding_mode);
 
@@ -1540,8 +1602,11 @@ IEEEFloat::opStatus IEEEFloat::normalize(roundingMode rounding_mode,
      underflow for exact results.  */
   if (lost_fraction == lfExactlyZero) {
     /* Canonicalize zeroes.  */
-    if (omsb == 0)
+    if (omsb == 0) {
       category = fcZero;
+      if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+        sign = false;
+    }
 
     return opOK;
   }
@@ -1559,18 +1624,22 @@ IEEEFloat::opStatus IEEEFloat::normalize(roundingMode rounding_mode,
       /* Renormalize by incrementing the exponent and shifting our
          significand right one.  However if we already have the
          maximum exponent we overflow to infinity.  */
-      if (exponent == semantics->maxExponent) {
-        category = fcInfinity;
-
-        return (opStatus) (opOverflow | opInexact);
-      }
+      if (exponent == semantics->maxExponent)
+        // Invoke overflow handling with a rounding mode that will guarantee
+        // that the result gets turned into the correct infinity representation.
+        // This is needed instead of just setting the category to infinity to
+        // account for 8-bit floating point types that have no inf, only NaN.
+        return handleOverflow(sign ? rmTowardNegative : rmTowardPositive);
 
       shiftSignificandRight(1);
 
       return opInexact;
     }
 
+    // The all-ones value is an overflow if NaN is all ones. If NaN is
+    // represented by negative zero, then it is a valid finite value.
     if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly &&
+        semantics->nanEncoding == fltNanEncoding::AllOnes &&
         exponent == semantics->maxExponent && isSignificandAllOnes())
       return handleOverflow(rounding_mode);
   }
@@ -1584,8 +1653,11 @@ IEEEFloat::opStatus IEEEFloat::normalize(roundingMode rounding_mode,
   assert(omsb < semantics->precision);
 
   /* Canonicalize zeroes.  */
-  if (omsb == 0)
+  if (omsb == 0) {
     category = fcZero;
+    if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+      sign = false;
+  }
 
   /* The fcZero case is a denormal that underflowed to zero.  */
   return (opStatus) (opUnderflow | opInexact);
@@ -1887,6 +1959,11 @@ IEEEFloat::opStatus IEEEFloat::remainderSpecials(const IEEEFloat &rhs) {
 
 /* Change sign.  */
 void IEEEFloat::changeSign() {
+  // With NaN-as-negative-zero, neither NaN nor zero can change
+  // their signs.
+  if (semantics->nanEncoding == fltNanEncoding::NegativeZero &&
+      (isZero() || isNaN()))
+    return;
   /* Look mummy, this one's easy.  */
   sign = !sign;
 }
@@ -1916,6 +1993,9 @@ IEEEFloat::opStatus IEEEFloat::addOrSubtract(const IEEEFloat &rhs,
   if (category == fcZero) {
     if (rhs.category != fcZero || (sign == rhs.sign) == subtract)
       sign = (rounding_mode == rmTowardNegative);
+    // NaN-in-negative-zero means zeros need to be normalized to +0.
+    if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+      sign = false;
   }
 
   return fs;
@@ -1941,6 +2021,8 @@ IEEEFloat::opStatus IEEEFloat::multiply(const IEEEFloat &rhs,
   sign ^= rhs.sign;
   fs = multiplySpecials(rhs);
 
+  if (isZero() && semantics->nanEncoding == fltNanEncoding::NegativeZero)
+    sign = false;
   if (isFiniteNonZero()) {
     lostFraction lost_fraction = multiplySignificand(rhs);
     fs = normalize(rounding_mode, lost_fraction);
@@ -1959,6 +2041,8 @@ IEEEFloat::opStatus IEEEFloat::divide(const IEEEFloat &rhs,
   sign ^= rhs.sign;
   fs = divideSpecials(rhs);
 
+  if (isZero() && semantics->nanEncoding == fltNanEncoding::NegativeZero)
+    sign = false;
   if (isFiniteNonZero()) {
     lostFraction lost_fraction = divideSignificand(rhs);
     fs = normalize(rounding_mode, lost_fraction);
@@ -2067,8 +2151,13 @@ IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) {
     }
   }
 
-  if (isZero())
+  if (isZero()) {
     sign = origSign;    // IEEE754 requires this
+    if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+      // But some 8-bit floats only have positive 0.
+      sign = false;
+  }
+
   else
     sign ^= origSign;
   return fs;
@@ -2093,8 +2182,11 @@ IEEEFloat::opStatus IEEEFloat::mod(const IEEEFloat &rhs) {
     fs = subtract(V, rmNearestTiesToEven);
     assert(fs==opOK);
   }
-  if (isZero())
+  if (isZero()) {
     sign = origSign; // fmod requires this
+    if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+      sign = false;
+  }
   return fs;
 }
 
@@ -2122,8 +2214,11 @@ IEEEFloat::opStatus IEEEFloat::fusedMultiplyAdd(const IEEEFloat &multiplicand,
     /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a
        positive zero unless rounding to minus infinity, except that
        adding two like-signed zeroes gives that zero.  */
-    if (category == fcZero && !(fs & opUnderflow) && sign != addend.sign)
+    if (category == fcZero && !(fs & opUnderflow) && sign != addend.sign) {
       sign = (rounding_mode == rmTowardNegative);
+      if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+        sign = false;
+    }
   } else {
     fs = multiplySpecials(multiplicand);
 
@@ -2399,6 +2494,12 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
       return is_signaling ? opInvalidOp : opOK;
     }
 
+    // If NaN is negative zero, we need to create a new NaN to avoid converting
+    // NaN to -Inf.
+    if (fromSemantics.nanEncoding == fltNanEncoding::NegativeZero &&
+        semantics->nanEncoding != fltNanEncoding::NegativeZero)
+      makeNaN(false, false);
+
     *losesInfo = lostFraction != lfExactlyZero || X86SpecialNan;
 
     // For x87 extended precision, we want to make a NaN, not a special NaN if
@@ -2420,6 +2521,14 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
     makeNaN(false, sign);
     *losesInfo = true;
     fs = opInexact;
+  } else if (category == fcZero &&
+             semantics->nanEncoding == fltNanEncoding::NegativeZero) {
+    // Negative zero loses info, but positive zero doesn't.
+    *losesInfo =
+        fromSemantics.nanEncoding != fltNanEncoding::NegativeZero && sign;
+    fs = *losesInfo ? opInexact : opOK;
+    // NaN is negative zero means -0 -> +0, which can lose information
+    sign = false;
   } else {
     *losesInfo = false;
     fs = opOK;
@@ -2887,9 +2996,11 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) {
   if (D.firstSigDigit == str.end() || decDigitValue(*D.firstSigDigit) >= 10U) {
     category = fcZero;
     fs = opOK;
+    if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+      sign = false;
 
-  /* Check whether the normalized exponent is high enough to overflow
-     max during the log-rebasing in the max-exponent check below. */
+    /* Check whether the normalized exponent is high enough to overflow
+       max during the log-rebasing in the max-exponent check below. */
   } else if (D.normalizedExponent - 1 > INT_MAX / 42039) {
     fs = handleOverflow(rounding_mode);
 
@@ -3517,6 +3628,33 @@ APInt IEEEFloat::convertFloat8E5M2APFloatToAPInt() const {
                    (mysignificand & 0x3)));
 }
 
+APInt IEEEFloat::convertFloat8E5M2FNUZAPFloatToAPInt() const {
+  assert(semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ);
+  assert(partCount() == 1);
+
+  uint32_t myexponent, mysignificand;
+
+  if (isFiniteNonZero()) {
+    myexponent = exponent + 16; // bias
+    mysignificand = (uint32_t)*significandParts();
+    if (myexponent == 1 && !(mysignificand & 0x4))
+      myexponent = 0; // denormal
+  } else if (category == fcZero) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else if (category == fcInfinity) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else {
+    assert(category == fcNaN && "Unknown category!");
+    myexponent = 0;
+    mysignificand = (uint32_t)*significandParts();
+  }
+
+  return APInt(8, (((sign & 1) << 7) | ((myexponent & 0x1f) << 2) |
+                   (mysignificand & 0x3)));
+}
+
 APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const {
   assert(semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN);
   assert(partCount() == 1);
@@ -3544,6 +3682,33 @@ APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const {
                    (mysignificand & 0x7)));
 }
 
+APInt IEEEFloat::convertFloat8E4M3FNUZAPFloatToAPInt() const {
+  assert(semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ);
+  assert(partCount() == 1);
+
+  uint32_t myexponent, mysignificand;
+
+  if (isFiniteNonZero()) {
+    myexponent = exponent + 8; // bias
+    mysignificand = (uint32_t)*significandParts();
+    if (myexponent == 1 && !(mysignificand & 0x8))
+      myexponent = 0; // denormal
+  } else if (category == fcZero) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else if (category == fcInfinity) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else {
+    assert(category == fcNaN && "Unknown category!");
+    myexponent = 0;
+    mysignificand = (uint32_t)*significandParts();
+  }
+
+  return APInt(8, (((sign & 1) << 7) | ((myexponent & 0xf) << 3) |
+                   (mysignificand & 0x7)));
+}
+
 // This function creates an APInt that is just a bit map of the floating
 // point constant as it would appear in memory.  It is not a conversion,
 // and treating the result as a normal integer is unlikely to be useful.
@@ -3570,9 +3735,15 @@ APInt IEEEFloat::bitcastToAPInt() const {
   if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2)
     return convertFloat8E5M2APFloatToAPInt();
 
+  if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ)
+    return convertFloat8E5M2FNUZAPFloatToAPInt();
+
   if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN)
     return convertFloat8E4M3FNAPFloatToAPInt();
 
+  if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ)
+    return convertFloat8E4M3FNUZAPFloatToAPInt();
+
   assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended &&
          "unknown format!");
   return convertF80LongDoubleAPFloatToAPInt();
@@ -3828,6 +3999,32 @@ void IEEEFloat::initFromFloat8E5M2APInt(const APInt &api) {
   }
 }
 
+void IEEEFloat::initFromFloat8E5M2FNUZAPInt(const APInt &api) {
+  uint32_t i = (uint32_t)*api.getRawData();
+  uint32_t myexponent = (i >> 2) & 0x1f;
+  uint32_t mysignificand = i & 0x3;
+
+  initialize(&semFloat8E5M2FNUZ);
+  assert(partCount() == 1);
+
+  sign = i >> 7;
+  if (myexponent == 0 && mysignificand == 0 && sign == 0) {
+    makeZero(sign);
+  } else if (myexponent == 0 && mysignificand == 0 && sign == 1) {
+    category = fcNaN;
+    exponent = exponentNaN();
+    *significandParts() = mysignificand;
+  } else {
+    category = fcNormal;
+    exponent = myexponent - 16; // bias
+    *significandParts() = mysignificand;
+    if (myexponent == 0) // denormal
+      exponent = -15;
+    else
+      *significandParts() |= 0x4; // integer bit
+  }
+}
+
 void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) {
   uint32_t i = (uint32_t)*api.getRawData();
   uint32_t myexponent = (i >> 3) & 0xf;
@@ -3854,6 +4051,32 @@ void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) {
   }
 }
 
+void IEEEFloat::initFromFloat8E4M3FNUZAPInt(const APInt &api) {
+  uint32_t i = (uint32_t)*api.getRawData();
+  uint32_t myexponent = (i >> 3) & 0xf;
+  uint32_t mysignificand = i & 0x7;
+
+  initialize(&semFloat8E4M3FNUZ);
+  assert(partCount() == 1);
+
+  sign = i >> 7;
+  if (myexponent == 0 && mysignificand == 0 && sign == 0) {
+    makeZero(sign);
+  } else if (myexponent == 0 && mysignificand == 0 && sign == 1) {
+    category = fcNaN;
+    exponent = exponentNaN();
+    *significandParts() = mysignificand;
+  } else {
+    category = fcNormal;
+    exponent = myexponent - 8; // bias
+    *significandParts() = mysignificand;
+    if (myexponent == 0) // denormal
+      exponent = -7;
+    else
+      *significandParts() |= 0x8; // integer bit
+  }
+}
+
 /// Treat api as containing the bits of a floating point number.
 void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
   assert(api.getBitWidth() == Sem->sizeInBits);
@@ -3873,8 +4096,12 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
     return initFromPPCDoubleDoubleAPInt(api);
   if (Sem == &semFloat8E5M2)
     return initFromFloat8E5M2APInt(api);
+  if (Sem == &semFloat8E5M2FNUZ)
+    return initFromFloat8E5M2FNUZAPInt(api);
   if (Sem == &semFloat8E4M3FN)
     return initFromFloat8E4M3FNAPInt(api);
+  if (Sem == &semFloat8E4M3FNUZ)
+    return initFromFloat8E4M3FNUZAPInt(api);
 
   llvm_unreachable(nullptr);
 }
@@ -3903,7 +4130,8 @@ void IEEEFloat::makeLargest(bool Negative) {
                                    ? (~integerPart(0) >> NumUnusedHighBits)
                                    : 0;
 
-  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
+  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly &&
+      semantics->nanEncoding == fltNanEncoding::AllOnes)
     significand[0] &= ~integerPart(1);
 }
 
@@ -4331,6 +4559,8 @@ IEEEFloat::opStatus IEEEFloat::next(bool nextDown) {
       APInt::tcSet(significandParts(), 0, partCount());
       category = fcZero;
       exponent = 0;
+      if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+        sign = false;
       break;
     }
 
@@ -4417,8 +4647,11 @@ IEEEFloat::opStatus IEEEFloat::next(bool nextDown) {
 }
 
 APFloatBase::ExponentType IEEEFloat::exponentNaN() const {
-  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
+  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) {
+    if (semantics->nanEncoding == fltNanEncoding::NegativeZero)
+      return semantics->minExponent;
     return semantics->maxExponent;
+  }
   return semantics->maxExponent + 1;
 }
 
@@ -4445,6 +4678,10 @@ void IEEEFloat::makeInf(bool Negative) {
 void IEEEFloat::makeZero(bool Negative) {
   category = fcZero;
   sign = Negative;
+  if (semantics->nanEncoding == fltNanEncoding::NegativeZero) {
+    // Merge negative zero to positive because 0b10000...000 is used for NaN
+    sign = false;
+  }
   exponent = exponentZero();
   APInt::tcSet(significandParts(), 0, partCount());
 }

diff  --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index ff295f7b40c72..2ec8ebf30b632 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -9,6 +9,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -1291,6 +1292,7 @@ TEST(APFloatTest, makeNaN) {
     bool Negative;
     uint64_t payload;
   } tests[] = {
+      // clang-format off
     /*             expected              semantics   SNaN    Neg                payload */
     {         0x7fc00000ULL, APFloat::IEEEsingle(), false, false,         0x00000000ULL },
     {         0xffc00000ULL, APFloat::IEEEsingle(), false,  true,         0x00000000ULL },
@@ -1312,6 +1314,15 @@ TEST(APFloatTest, makeNaN) {
     { 0x7ff000000000ae72ULL, APFloat::IEEEdouble(),  true, false, 0x000000000000ae72ULL },
     { 0x7ff7ffffffffae72ULL, APFloat::IEEEdouble(),  true, false, 0xffffffffffffae72ULL },
     { 0x7ff1aaaaaaaaae72ULL, APFloat::IEEEdouble(),  true, false, 0x0001aaaaaaaaae72ULL },
+    {               0x80ULL, APFloat::Float8E5M2FNUZ(), false, false,           0xaaULL },
+    {               0x80ULL, APFloat::Float8E5M2FNUZ(), false, true,            0xaaULL },
+    {               0x80ULL, APFloat::Float8E5M2FNUZ(), true, false,            0xaaULL },
+    {               0x80ULL, APFloat::Float8E5M2FNUZ(), true, true,             0xaaULL },
+    {               0x80ULL, APFloat::Float8E4M3FNUZ(), false, false,           0xaaULL },
+    {               0x80ULL, APFloat::Float8E4M3FNUZ(), false, true,            0xaaULL },
+    {               0x80ULL, APFloat::Float8E4M3FNUZ(), true, false,            0xaaULL },
+    {               0x80ULL, APFloat::Float8E4M3FNUZ(), true, true,             0xaaULL },
+      // clang-format on
   };
 
   for (const auto &t : tests) {
@@ -1735,6 +1746,10 @@ TEST(APFloatTest, getLargest) {
   EXPECT_EQ(3.402823466e+38f, APFloat::getLargest(APFloat::IEEEsingle()).convertToFloat());
   EXPECT_EQ(1.7976931348623158e+308, APFloat::getLargest(APFloat::IEEEdouble()).convertToDouble());
   EXPECT_EQ(448, APFloat::getLargest(APFloat::Float8E4M3FN()).convertToDouble());
+  EXPECT_EQ(240,
+            APFloat::getLargest(APFloat::Float8E4M3FNUZ()).convertToDouble());
+  EXPECT_EQ(57344,
+            APFloat::getLargest(APFloat::Float8E5M2FNUZ()).convertToDouble());
 }
 
 TEST(APFloatTest, getSmallest) {
@@ -1765,6 +1780,20 @@ TEST(APFloatTest, getSmallest) {
   EXPECT_TRUE(test.isFiniteNonZero());
   EXPECT_TRUE(test.isDenormal());
   EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), false);
+  expected = APFloat(APFloat::Float8E5M2FNUZ(), "0x0.4p-15");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), false);
+  expected = APFloat(APFloat::Float8E4M3FNUZ(), "0x0.2p-7");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
 }
 
 TEST(APFloatTest, getSmallestNormalized) {
@@ -1815,33 +1844,53 @@ TEST(APFloatTest, getSmallestNormalized) {
   EXPECT_FALSE(test.isDenormal());
   EXPECT_TRUE(test.bitwiseIsEqual(expected));
   EXPECT_TRUE(test.isSmallestNormalized());
+
+  test = APFloat::getSmallestNormalized(APFloat::Float8E5M2FNUZ(), false);
+  expected = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.0p-15");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_FALSE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+  EXPECT_TRUE(test.isSmallestNormalized());
+
+  test = APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ(), false);
+  expected = APFloat(APFloat::Float8E4M3FNUZ(), "0x1.0p-7");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_FALSE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+  EXPECT_TRUE(test.isSmallestNormalized());
 }
 
 TEST(APFloatTest, getZero) {
   struct {
     const fltSemantics *semantics;
     const bool sign;
+    const bool signedZero;
     const unsigned long long bitPattern[2];
     const unsigned bitPatternLength;
   } const GetZeroTest[] = {
-      {&APFloat::IEEEhalf(), false, {0, 0}, 1},
-      {&APFloat::IEEEhalf(), true, {0x8000ULL, 0}, 1},
-      {&APFloat::IEEEsingle(), false, {0, 0}, 1},
-      {&APFloat::IEEEsingle(), true, {0x80000000ULL, 0}, 1},
-      {&APFloat::IEEEdouble(), false, {0, 0}, 1},
-      {&APFloat::IEEEdouble(), true, {0x8000000000000000ULL, 0}, 1},
-      {&APFloat::IEEEquad(), false, {0, 0}, 2},
-      {&APFloat::IEEEquad(), true, {0, 0x8000000000000000ULL}, 2},
-      {&APFloat::PPCDoubleDouble(), false, {0, 0}, 2},
-      {&APFloat::PPCDoubleDouble(), true, {0x8000000000000000ULL, 0}, 2},
-      {&APFloat::x87DoubleExtended(), false, {0, 0}, 2},
-      {&APFloat::x87DoubleExtended(), true, {0, 0x8000ULL}, 2},
-      {&APFloat::Float8E5M2(), false, {0, 0}, 1},
-      {&APFloat::Float8E5M2(), true, {0x80ULL, 0}, 1},
-      {&APFloat::Float8E4M3FN(), false, {0, 0}, 1},
-      {&APFloat::Float8E4M3FN(), true, {0x80ULL, 0}, 1},
-  };
-  const unsigned NumGetZeroTests = 12;
+      {&APFloat::IEEEhalf(), false, true, {0, 0}, 1},
+      {&APFloat::IEEEhalf(), true, true, {0x8000ULL, 0}, 1},
+      {&APFloat::IEEEsingle(), false, true, {0, 0}, 1},
+      {&APFloat::IEEEsingle(), true, true, {0x80000000ULL, 0}, 1},
+      {&APFloat::IEEEdouble(), false, true, {0, 0}, 1},
+      {&APFloat::IEEEdouble(), true, true, {0x8000000000000000ULL, 0}, 1},
+      {&APFloat::IEEEquad(), false, true, {0, 0}, 2},
+      {&APFloat::IEEEquad(), true, true, {0, 0x8000000000000000ULL}, 2},
+      {&APFloat::PPCDoubleDouble(), false, true, {0, 0}, 2},
+      {&APFloat::PPCDoubleDouble(), true, true, {0x8000000000000000ULL, 0}, 2},
+      {&APFloat::x87DoubleExtended(), false, true, {0, 0}, 2},
+      {&APFloat::x87DoubleExtended(), true, true, {0, 0x8000ULL}, 2},
+      {&APFloat::Float8E5M2(), false, true, {0, 0}, 1},
+      {&APFloat::Float8E5M2(), true, true, {0x80ULL, 0}, 1},
+      {&APFloat::Float8E5M2FNUZ(), false, false, {0, 0}, 1},
+      {&APFloat::Float8E5M2FNUZ(), true, false, {0, 0}, 1},
+      {&APFloat::Float8E4M3FN(), false, true, {0, 0}, 1},
+      {&APFloat::Float8E4M3FN(), true, true, {0x80ULL, 0}, 1},
+      {&APFloat::Float8E4M3FNUZ(), false, false, {0, 0}, 1},
+      {&APFloat::Float8E4M3FNUZ(), true, false, {0, 0}, 1}};
+  const unsigned NumGetZeroTests = std::size(GetZeroTest);
   for (unsigned i = 0; i < NumGetZeroTests; ++i) {
     APFloat test = APFloat::getZero(*GetZeroTest[i].semantics,
                                     GetZeroTest[i].sign);
@@ -1849,7 +1898,10 @@ TEST(APFloatTest, getZero) {
     APFloat expected = APFloat(*GetZeroTest[i].semantics,
                                pattern);
     EXPECT_TRUE(test.isZero());
-    EXPECT_TRUE(GetZeroTest[i].sign? test.isNegative() : !test.isNegative());
+    if (GetZeroTest[i].signedZero)
+      EXPECT_TRUE(GetZeroTest[i].sign ? test.isNegative() : !test.isNegative());
+    else
+      EXPECT_TRUE(!test.isNegative());
     EXPECT_TRUE(test.bitwiseIsEqual(expected));
     for (unsigned j = 0, je = GetZeroTest[i].bitPatternLength; j < je; ++j) {
       EXPECT_EQ(GetZeroTest[i].bitPattern[j],
@@ -1867,6 +1919,15 @@ TEST(APFloatTest, copySign) {
       APFloat::copySign(APFloat(-42.0), APFloat(-1.0))));
   EXPECT_TRUE(APFloat(42.0).bitwiseIsEqual(
       APFloat::copySign(APFloat(42.0), APFloat(1.0))));
+  // For floating-point formats with unsigned 0, copySign() to a zero is a noop
+  EXPECT_TRUE(
+      APFloat::getZero(APFloat::Float8E4M3FNUZ())
+          .bitwiseIsEqual(APFloat::copySign(
+              APFloat::getZero(APFloat::Float8E4M3FNUZ()), APFloat(-1.0))));
+  EXPECT_TRUE(
+      APFloat::getNaN(APFloat::Float8E4M3FNUZ(), true)
+          .bitwiseIsEqual(APFloat::copySign(
+              APFloat::getNaN(APFloat::Float8E4M3FNUZ(), true), APFloat(1.0))));
 }
 
 TEST(APFloatTest, convert) {
@@ -1979,6 +2040,67 @@ TEST(APFloatTest, convert) {
   EXPECT_TRUE(losesInfo);
 }
 
+TEST(APFloatTest, Float8UZConvert) {
+  bool losesInfo = false;
+  std::pair<APFloat, APFloat::opStatus> toNaNTests[] = {
+      {APFloat::getQNaN(APFloat::IEEEsingle(), false), APFloat::opOK},
+      {APFloat::getQNaN(APFloat::IEEEsingle(), true), APFloat::opOK},
+      {APFloat::getSNaN(APFloat::IEEEsingle(), false), APFloat::opInvalidOp},
+      {APFloat::getSNaN(APFloat::IEEEsingle(), true), APFloat::opInvalidOp},
+      {APFloat::getInf(APFloat::IEEEsingle(), false), APFloat::opInexact},
+      {APFloat::getInf(APFloat::IEEEsingle(), true), APFloat::opInexact}};
+  for (auto [toTest, expectedRes] : toNaNTests) {
+    llvm::SmallString<16> value;
+    toTest.toString(value);
+    SCOPED_TRACE("toTest = " + value);
+    for (const fltSemantics *sem :
+         {&APFloat::Float8E4M3FNUZ(), &APFloat::Float8E5M2FNUZ()}) {
+      SCOPED_TRACE("Semantics = " +
+                   std::to_string(APFloat::SemanticsToEnum(*sem)));
+      losesInfo = false;
+      APFloat test = toTest;
+      EXPECT_EQ(test.convert(*sem, APFloat::rmNearestTiesToAway, &losesInfo),
+                expectedRes);
+      EXPECT_TRUE(test.isNaN());
+      EXPECT_TRUE(test.isNegative());
+      EXPECT_FALSE(test.isSignaling());
+      EXPECT_FALSE(test.isInfinity());
+      EXPECT_EQ(0x80, test.bitcastToAPInt());
+      EXPECT_TRUE(losesInfo);
+    }
+  }
+
+  // Negative zero conversions are information losing.
+  losesInfo = false;
+  APFloat test = APFloat::getZero(APFloat::IEEEsingle(), true);
+  EXPECT_EQ(test.convert(APFloat::Float8E5M2FNUZ(),
+                         APFloat::rmNearestTiesToAway, &losesInfo),
+            APFloat::opInexact);
+  EXPECT_TRUE(test.isZero());
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(0x0, test.bitcastToAPInt());
+
+  losesInfo = true;
+  test = APFloat::getZero(APFloat::IEEEsingle(), false);
+  EXPECT_EQ(test.convert(APFloat::Float8E5M2FNUZ(),
+                         APFloat::rmNearestTiesToAway, &losesInfo),
+            APFloat::opOK);
+  EXPECT_TRUE(test.isZero());
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(0x0, test.bitcastToAPInt());
+
+  // Except in casts between ourselves.
+  losesInfo = true;
+  test = APFloat::getZero(APFloat::Float8E5M2FNUZ());
+  EXPECT_EQ(test.convert(APFloat::Float8E4M3FNUZ(),
+                         APFloat::rmNearestTiesToAway, &losesInfo),
+            APFloat::opOK);
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(0x0, test.bitcastToAPInt());
+}
+
 TEST(APFloatTest, PPCDoubleDouble) {
   APFloat test(APFloat::PPCDoubleDouble(), "1.0");
   EXPECT_EQ(0x3ff0000000000000ull, test.bitcastToAPInt().getRawData()[0]);
@@ -4850,6 +4972,87 @@ TEST(APFloatTest, x87Next) {
   EXPECT_TRUE(ilogb(F) == -1);
 }
 
+TEST(APFloatTest, Float8ExhaustivePair) {
+  // Test each pair of 8-bit floats with non-standard semantics
+  for (APFloat::Semantics Sem :
+       {APFloat::S_Float8E4M3FN, APFloat::S_Float8E5M2FNUZ,
+        APFloat::S_Float8E4M3FNUZ}) {
+    const llvm::fltSemantics &S = APFloat::EnumToSemantics(Sem);
+    for (int i = 0; i < 256; i++) {
+      for (int j = 0; j < 256; j++) {
+        SCOPED_TRACE("sem=" + std::to_string(Sem) + ",i=" + std::to_string(i) +
+                     ",j=" + std::to_string(j));
+        APFloat x(S, APInt(8, i));
+        APFloat y(S, APInt(8, j));
+
+        bool losesInfo;
+        APFloat x16 = x;
+        x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+                    &losesInfo);
+        EXPECT_FALSE(losesInfo);
+        APFloat y16 = y;
+        y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+                    &losesInfo);
+        EXPECT_FALSE(losesInfo);
+
+        // Add
+        APFloat z = x;
+        z.add(y, APFloat::rmNearestTiesToEven);
+        APFloat z16 = x16;
+        z16.add(y16, APFloat::rmNearestTiesToEven);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Subtract
+        z = x;
+        z.subtract(y, APFloat::rmNearestTiesToEven);
+        z16 = x16;
+        z16.subtract(y16, APFloat::rmNearestTiesToEven);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Multiply
+        z = x;
+        z.multiply(y, APFloat::rmNearestTiesToEven);
+        z16 = x16;
+        z16.multiply(y16, APFloat::rmNearestTiesToEven);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Divide
+        z = x;
+        z.divide(y, APFloat::rmNearestTiesToEven);
+        z16 = x16;
+        z16.divide(y16, APFloat::rmNearestTiesToEven);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Mod
+        z = x;
+        z.mod(y);
+        z16 = x16;
+        z16.mod(y16);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Remainder
+        z = x;
+        z.remainder(y);
+        z16 = x16;
+        z16.remainder(y16);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+      }
+    }
+  }
+}
+
 TEST(APFloatTest, ConvertE4M3FNToE5M2) {
   bool losesInfo;
   APFloat test(APFloat::Float8E4M3FN(), "1.0");
@@ -5143,11 +5346,11 @@ TEST(APFloatTest, Float8E4M3FNExhaustive) {
 
     // convert to BFloat
     APFloat test2 = test;
-    bool loses_info;
+    bool losesInfo;
     APFloat::opStatus status = test2.convert(
-        APFloat::BFloat(), APFloat::rmNearestTiesToEven, &loses_info);
+        APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo);
     EXPECT_EQ(status, APFloat::opOK);
-    EXPECT_FALSE(loses_info);
+    EXPECT_FALSE(losesInfo);
     if (i == 127 || i == 255)
       EXPECT_TRUE(test2.isNaN());
     else
@@ -5158,95 +5361,511 @@ TEST(APFloatTest, Float8E4M3FNExhaustive) {
   }
 }
 
-TEST(APFloatTest, Float8E4M3FNExhaustivePair) {
-  // Test each pair of Float8E4M3FN values.
-  for (int i = 0; i < 256; i++) {
-    for (int j = 0; j < 256; j++) {
-      SCOPED_TRACE("i=" + std::to_string(i) + ",j=" + std::to_string(j));
-      APFloat x(APFloat::Float8E4M3FN(), APInt(8, i));
-      APFloat y(APFloat::Float8E4M3FN(), APInt(8, j));
+TEST(APFloatTest, Float8E5M2FNUZNext) {
+  APFloat test(APFloat::Float8E5M2FNUZ(), APFloat::uninitialized);
+  APFloat expected(APFloat::Float8E5M2FNUZ(), APFloat::uninitialized);
+
+  // 1. NextUp of largest bit pattern is nan
+  test = APFloat::getLargest(APFloat::Float8E5M2FNUZ());
+  expected = APFloat::getNaN(APFloat::Float8E5M2FNUZ());
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_FALSE(test.isInfinity());
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.isNaN());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 2. NextUp of smallest negative denormal is +0
+  test = APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), true);
+  expected = APFloat::getZero(APFloat::Float8E5M2FNUZ(), false);
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_FALSE(test.isNegZero());
+  EXPECT_TRUE(test.isPosZero());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 3. nextDown of negative of largest value is NaN
+  test = APFloat::getLargest(APFloat::Float8E5M2FNUZ(), true);
+  expected = APFloat::getNaN(APFloat::Float8E5M2FNUZ());
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_FALSE(test.isInfinity());
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.isNaN());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 4. nextDown of +0 is smallest negative denormal
+  test = APFloat::getZero(APFloat::Float8E5M2FNUZ(), false);
+  expected = APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), true);
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 5. nextUp of NaN is NaN
+  test = APFloat::getNaN(APFloat::Float8E5M2FNUZ(), false);
+  expected = APFloat::getNaN(APFloat::Float8E5M2FNUZ(), true);
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_TRUE(test.isNaN());
+
+  // 6. nextDown of NaN is NaN
+  test = APFloat::getNaN(APFloat::Float8E5M2FNUZ(), false);
+  expected = APFloat::getNaN(APFloat::Float8E5M2FNUZ(), true);
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_TRUE(test.isNaN());
+}
+
+TEST(APFloatTest, Float8E5M2FNUZChangeSign) {
+  APFloat test = APFloat(APFloat::Float8E5M2FNUZ(), "1.0");
+  APFloat expected = APFloat(APFloat::Float8E5M2FNUZ(), "-1.0");
+  test.changeSign();
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getZero(APFloat::Float8E5M2FNUZ());
+  expected = test;
+  test.changeSign();
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getNaN(APFloat::Float8E5M2FNUZ());
+  expected = test;
+  test.changeSign();
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+}
+
+TEST(APFloatTest, Float8E5M2FNUZFromString) {
+  // Exactly representable
+  EXPECT_EQ(57344,
+            APFloat(APFloat::Float8E5M2FNUZ(), "57344").convertToDouble());
+  // Round down to maximum value
+  EXPECT_EQ(57344,
+            APFloat(APFloat::Float8E5M2FNUZ(), "59392").convertToDouble());
+  // Round up, causing overflow to NaN
+  EXPECT_TRUE(APFloat(APFloat::Float8E5M2FNUZ(), "61440").isNaN());
+  // Overflow without rounding
+  EXPECT_TRUE(APFloat(APFloat::Float8E5M2FNUZ(), "131072").isNaN());
+  // Inf converted to NaN
+  EXPECT_TRUE(APFloat(APFloat::Float8E5M2FNUZ(), "inf").isNaN());
+  // NaN converted to NaN
+  EXPECT_TRUE(APFloat(APFloat::Float8E5M2FNUZ(), "nan").isNaN());
+  // Negative zero converted to positive zero
+  EXPECT_TRUE(APFloat(APFloat::Float8E5M2FNUZ(), "-0").isPosZero());
+}
+
+TEST(APFloatTest, UnsignedZeroArithmeticSpecial) {
+  // Float semantics with only unsigned zero (ex. Float8E4M3FNUZ) violate the
+  // IEEE rules about signs in arithmetic operations when producing zeros,
+  // because they only have one zero. Most of the rest of the complexities of
+  // arithmetic on these values are covered by the other Float8 types' test
+  // cases and so are not repeated here.
+
+  // The IEEE round towards negative rule doesn't apply
+  APFloat test = APFloat::getSmallest(APFloat::Float8E4M3FNUZ());
+  APFloat rhs = test;
+  EXPECT_EQ(test.subtract(rhs, APFloat::rmTowardNegative), APFloat::opOK);
+  EXPECT_TRUE(test.isZero());
+  EXPECT_FALSE(test.isNegative());
+
+  // Multiplication of (small) * (-small) is +0
+  test = APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ());
+  rhs = -test;
+  EXPECT_EQ(test.multiply(rhs, APFloat::rmNearestTiesToAway),
+            APFloat::opInexact | APFloat::opUnderflow);
+  EXPECT_TRUE(test.isZero());
+  EXPECT_FALSE(test.isNegative());
+
+  // Dividing the negative float_min by anything gives +0
+  test = APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), true);
+  rhs = APFloat(APFloat::Float8E4M3FNUZ(), "2.0");
+  EXPECT_EQ(test.divide(rhs, APFloat::rmNearestTiesToEven),
+            APFloat::opInexact | APFloat::opUnderflow);
+  EXPECT_TRUE(test.isZero());
+  EXPECT_FALSE(test.isNegative());
+
+  // Remainder can't copy sign because there's only one zero
+  test = APFloat(APFloat::Float8E4M3FNUZ(), "-4.0");
+  rhs = APFloat(APFloat::Float8E4M3FNUZ(), "2.0");
+  EXPECT_EQ(test.remainder(rhs), APFloat::opOK);
+  EXPECT_TRUE(test.isZero());
+  EXPECT_FALSE(test.isNegative());
+
+  // And same for mod
+  test = APFloat(APFloat::Float8E4M3FNUZ(), "-4.0");
+  rhs = APFloat(APFloat::Float8E4M3FNUZ(), "2.0");
+  EXPECT_EQ(test.mod(rhs), APFloat::opOK);
+  EXPECT_TRUE(test.isZero());
+  EXPECT_FALSE(test.isNegative());
+
+  // FMA correctly handles both the multiply and add parts of all this
+  test = APFloat(APFloat::Float8E4M3FNUZ(), "2.0");
+  rhs = test;
+  APFloat addend = APFloat(APFloat::Float8E4M3FNUZ(), "-4.0");
+  EXPECT_EQ(test.fusedMultiplyAdd(rhs, addend, APFloat::rmTowardNegative),
+            APFloat::opOK);
+  EXPECT_TRUE(test.isZero());
+  EXPECT_FALSE(test.isNegative());
+}
+
+TEST(APFloatTest, Float8E5M2FNUZAdd) {
+  APFloat QNaN = APFloat::getNaN(APFloat::Float8E5M2FNUZ(), false);
 
+  auto FromStr = [](StringRef S) {
+    return APFloat(APFloat::Float8E5M2FNUZ(), S);
+  };
+
+  struct {
+    APFloat x;
+    APFloat y;
+    const char *result;
+    int status;
+    int category;
+    APFloat::roundingMode roundingMode = APFloat::rmNearestTiesToEven;
+  } AdditionTests[] = {
+      // Test addition operations involving NaN, overflow, and the max E5M2FNUZ
+      // value (57344) because E5M2FNUZ differs from IEEE-754 types in these
+      // regards
+      {FromStr("57344"), FromStr("2048"), "57344", APFloat::opInexact,
+       APFloat::fcNormal},
+      {FromStr("57344"), FromStr("4096"), "NaN",
+       APFloat::opOverflow | APFloat::opInexact, APFloat::fcNaN},
+      {FromStr("-57344"), FromStr("-4096"), "NaN",
+       APFloat::opOverflow | APFloat::opInexact, APFloat::fcNaN},
+      {QNaN, FromStr("-57344"), "NaN", APFloat::opOK, APFloat::fcNaN},
+      {FromStr("57344"), FromStr("-8192"), "49152", APFloat::opOK,
+       APFloat::fcNormal},
+      {FromStr("57344"), FromStr("0"), "57344", APFloat::opOK,
+       APFloat::fcNormal},
+      {FromStr("57344"), FromStr("4096"), "57344", APFloat::opInexact,
+       APFloat::fcNormal, APFloat::rmTowardZero},
+      {FromStr("57344"), FromStr("57344"), "57344", APFloat::opInexact,
+       APFloat::fcNormal, APFloat::rmTowardZero},
+  };
+
+  for (size_t i = 0; i < std::size(AdditionTests); ++i) {
+    APFloat x(AdditionTests[i].x);
+    APFloat y(AdditionTests[i].y);
+    APFloat::opStatus status = x.add(y, AdditionTests[i].roundingMode);
+
+    APFloat result(APFloat::Float8E5M2FNUZ(), AdditionTests[i].result);
+
+    EXPECT_TRUE(result.bitwiseIsEqual(x));
+    EXPECT_EQ(AdditionTests[i].status, (int)status);
+    EXPECT_EQ(AdditionTests[i].category, (int)x.getCategory());
+  }
+}
+
+TEST(APFloatTest, Float8E5M2FNUZDivideByZero) {
+  APFloat x(APFloat::Float8E5M2FNUZ(), "1");
+  APFloat zero(APFloat::Float8E5M2FNUZ(), "0");
+  EXPECT_EQ(x.divide(zero, APFloat::rmNearestTiesToEven), APFloat::opDivByZero);
+  EXPECT_TRUE(x.isNaN());
+}
+
+TEST(APFloatTest, Float8UnsignedZeroExhaustive) {
+  struct {
+    const fltSemantics *semantics;
+    const double largest;
+    const double smallest;
+  } const exhaustiveTests[] = {{&APFloat::Float8E5M2FNUZ(), 57344., 0x1.0p-17},
+                               {&APFloat::Float8E4M3FNUZ(), 240., 0x1.0p-10}};
+  for (const auto &testInfo : exhaustiveTests) {
+    const fltSemantics &sem = *testInfo.semantics;
+    SCOPED_TRACE("Semantics=" + std::to_string(APFloat::SemanticsToEnum(sem)));
+    // Test each of the 256 values.
+    for (int i = 0; i < 256; i++) {
+      SCOPED_TRACE("i=" + std::to_string(i));
+      APFloat test(sem, APInt(8, i));
+
+      // isLargest
+      if (i == 127 || i == 255) {
+        EXPECT_TRUE(test.isLargest());
+        EXPECT_EQ(abs(test).convertToDouble(), testInfo.largest);
+      } else {
+        EXPECT_FALSE(test.isLargest());
+      }
+
+      // isSmallest
+      if (i == 1 || i == 129) {
+        EXPECT_TRUE(test.isSmallest());
+        EXPECT_EQ(abs(test).convertToDouble(), testInfo.smallest);
+      } else {
+        EXPECT_FALSE(test.isSmallest());
+      }
+
+      // convert to BFloat
+      APFloat test2 = test;
       bool losesInfo;
-      APFloat x16 = x;
-      x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
-                  &losesInfo);
-      EXPECT_FALSE(losesInfo);
-      APFloat y16 = y;
-      y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
-                  &losesInfo);
+      APFloat::opStatus status = test2.convert(
+          APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo);
+      EXPECT_EQ(status, APFloat::opOK);
       EXPECT_FALSE(losesInfo);
+      if (i == 128)
+        EXPECT_TRUE(test2.isNaN());
+      else
+        EXPECT_EQ(test.convertToFloat(), test2.convertToFloat());
 
-      // Add
-      APFloat z = x;
-      z.add(y, APFloat::rmNearestTiesToEven);
-      APFloat z16 = x16;
-      z16.add(y16, APFloat::rmNearestTiesToEven);
-      z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven,
-                  &losesInfo);
-      EXPECT_TRUE(z.bitwiseIsEqual(z16));
-
-      // Subtract
-      z = x;
-      z.subtract(y, APFloat::rmNearestTiesToEven);
-      z16 = x16;
-      z16.subtract(y16, APFloat::rmNearestTiesToEven);
-      z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven,
-                  &losesInfo);
-      EXPECT_TRUE(z.bitwiseIsEqual(z16));
-
-      // Multiply
-      z = x;
-      z.multiply(y, APFloat::rmNearestTiesToEven);
-      z16 = x16;
-      z16.multiply(y16, APFloat::rmNearestTiesToEven);
-      z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven,
-                  &losesInfo);
-      EXPECT_TRUE(z.bitwiseIsEqual(z16)) << "i=" << i << ", j=" << j;
-
-      // Divide
-      z = x;
-      z.divide(y, APFloat::rmNearestTiesToEven);
-      z16 = x16;
-      z16.divide(y16, APFloat::rmNearestTiesToEven);
-      z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven,
-                  &losesInfo);
-      EXPECT_TRUE(z.bitwiseIsEqual(z16)) << "i=" << i << ", j=" << j;
-
-      // Mod
-      z = x;
-      z.mod(y);
-      z16 = x16;
-      z16.mod(y16);
-      z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven,
-                  &losesInfo);
-      EXPECT_TRUE(z.bitwiseIsEqual(z16)) << "i=" << i << ", j=" << j;
-
-      // Remainder
-      z = x;
-      z.remainder(y);
-      z16 = x16;
-      z16.remainder(y16);
-      z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven,
-                  &losesInfo);
-      EXPECT_TRUE(z.bitwiseIsEqual(z16)) << "i=" << i << ", j=" << j;
+      // bitcastToAPInt
+      EXPECT_EQ(i, test.bitcastToAPInt());
     }
   }
 }
 
+TEST(APFloatTest, Float8E4M3FNUZNext) {
+  APFloat test(APFloat::Float8E4M3FNUZ(), APFloat::uninitialized);
+  APFloat expected(APFloat::Float8E4M3FNUZ(), APFloat::uninitialized);
+
+  // 1. NextUp of largest bit pattern is nan
+  test = APFloat::getLargest(APFloat::Float8E4M3FNUZ());
+  expected = APFloat::getNaN(APFloat::Float8E4M3FNUZ());
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_FALSE(test.isInfinity());
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.isNaN());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 2. NextUp of smallest negative denormal is +0
+  test = APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), true);
+  expected = APFloat::getZero(APFloat::Float8E4M3FNUZ(), false);
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_FALSE(test.isNegZero());
+  EXPECT_TRUE(test.isPosZero());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 3. nextDown of negative of largest value is NaN
+  test = APFloat::getLargest(APFloat::Float8E4M3FNUZ(), true);
+  expected = APFloat::getNaN(APFloat::Float8E4M3FNUZ());
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_FALSE(test.isInfinity());
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.isNaN());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 4. nextDown of +0 is smallest negative denormal
+  test = APFloat::getZero(APFloat::Float8E4M3FNUZ(), false);
+  expected = APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), true);
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 5. nextUp of NaN is NaN
+  test = APFloat::getNaN(APFloat::Float8E4M3FNUZ(), false);
+  expected = APFloat::getNaN(APFloat::Float8E4M3FNUZ(), true);
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_TRUE(test.isNaN());
+
+  // 6. nextDown of NaN is NaN
+  test = APFloat::getNaN(APFloat::Float8E4M3FNUZ(), false);
+  expected = APFloat::getNaN(APFloat::Float8E4M3FNUZ(), true);
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_TRUE(test.isNaN());
+}
+
+TEST(APFloatTest, Float8E4M3FNUZChangeSign) {
+  APFloat test = APFloat(APFloat::Float8E4M3FNUZ(), "1.0");
+  APFloat expected = APFloat(APFloat::Float8E4M3FNUZ(), "-1.0");
+  test.changeSign();
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getZero(APFloat::Float8E4M3FNUZ());
+  expected = test;
+  test.changeSign();
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getNaN(APFloat::Float8E4M3FNUZ());
+  expected = test;
+  test.changeSign();
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+}
+
+TEST(APFloatTest, Float8E4M3FNUZFromString) {
+  // Exactly representable
+  EXPECT_EQ(240, APFloat(APFloat::Float8E4M3FNUZ(), "240").convertToDouble());
+  // Round down to maximum value
+  EXPECT_EQ(240, APFloat(APFloat::Float8E4M3FNUZ(), "247").convertToDouble());
+  // Round up, causing overflow to NaN
+  EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "248").isNaN());
+  // Overflow without rounding
+  EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "480").isNaN());
+  // Inf converted to NaN
+  EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "inf").isNaN());
+  // NaN converted to NaN
+  EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "nan").isNaN());
+  // Negative zero converted to positive zero
+  EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "-0").isPosZero());
+}
+
+TEST(APFloatTest, Float8E4M3FNUZAdd) {
+  APFloat QNaN = APFloat::getNaN(APFloat::Float8E4M3FNUZ(), false);
+
+  auto FromStr = [](StringRef S) {
+    return APFloat(APFloat::Float8E4M3FNUZ(), S);
+  };
+
+  struct {
+    APFloat x;
+    APFloat y;
+    const char *result;
+    int status;
+    int category;
+    APFloat::roundingMode roundingMode = APFloat::rmNearestTiesToEven;
+  } AdditionTests[] = {
+      // Test addition operations involving NaN, overflow, and the max E4M3FNUZ
+      // value (240) because E4M3FNUZ differs from IEEE-754 types in these
+      // regards
+      {FromStr("240"), FromStr("4"), "240", APFloat::opInexact,
+       APFloat::fcNormal},
+      {FromStr("240"), FromStr("8"), "NaN",
+       APFloat::opOverflow | APFloat::opInexact, APFloat::fcNaN},
+      {FromStr("240"), FromStr("16"), "NaN",
+       APFloat::opOverflow | APFloat::opInexact, APFloat::fcNaN},
+      {FromStr("-240"), FromStr("-16"), "NaN",
+       APFloat::opOverflow | APFloat::opInexact, APFloat::fcNaN},
+      {QNaN, FromStr("-240"), "NaN", APFloat::opOK, APFloat::fcNaN},
+      {FromStr("240"), FromStr("-16"), "224", APFloat::opOK, APFloat::fcNormal},
+      {FromStr("240"), FromStr("0"), "240", APFloat::opOK, APFloat::fcNormal},
+      {FromStr("240"), FromStr("32"), "240", APFloat::opInexact,
+       APFloat::fcNormal, APFloat::rmTowardZero},
+      {FromStr("240"), FromStr("240"), "240", APFloat::opInexact,
+       APFloat::fcNormal, APFloat::rmTowardZero},
+  };
+
+  for (size_t i = 0; i < std::size(AdditionTests); ++i) {
+    APFloat x(AdditionTests[i].x);
+    APFloat y(AdditionTests[i].y);
+    APFloat::opStatus status = x.add(y, AdditionTests[i].roundingMode);
+
+    APFloat result(APFloat::Float8E4M3FNUZ(), AdditionTests[i].result);
+
+    EXPECT_TRUE(result.bitwiseIsEqual(x));
+    EXPECT_EQ(AdditionTests[i].status, (int)status);
+    EXPECT_EQ(AdditionTests[i].category, (int)x.getCategory());
+  }
+}
+
+TEST(APFloatTest, Float8E4M3FNUZDivideByZero) {
+  APFloat x(APFloat::Float8E4M3FNUZ(), "1");
+  APFloat zero(APFloat::Float8E4M3FNUZ(), "0");
+  EXPECT_EQ(x.divide(zero, APFloat::rmNearestTiesToEven), APFloat::opDivByZero);
+  EXPECT_TRUE(x.isNaN());
+}
+
+TEST(APFloatTest, ConvertE5M2FNUZToE4M3FNUZ) { // Float8E5M2FNUZ -> E4M3FNUZ.
+  bool losesInfo;
+  APFloat test(APFloat::Float8E5M2FNUZ(), "1.0");
+  APFloat::opStatus status = test.convert(
+      APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(1.0f, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  losesInfo = true; // Start from the opposite of the expected flag value.
+  test = APFloat(APFloat::Float8E5M2FNUZ(), "0.0");
+  status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  losesInfo = true;
+  test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.Cp7"); // 224
+  status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0x1.Cp7 /* 224 */, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  // Test overflow: 256 is above the largest E4M3FNUZ value, so expect NaN.
+  losesInfo = false;
+  test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.0p8"); // 256
+  status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_TRUE(std::isnan(test.convertToFloat()));
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOverflow | APFloat::opInexact);
+
+  // Test underflow to zero. NOTE(review): losesInfo is not reset before this.
+  test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.0p-11");
+  status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0., test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+
+  // Test rounding up to the smallest denormal number (0x1.0p-10).
+  losesInfo = false;
+  test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.8p-11");
+  status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0x1.0p-10, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+
+  // Test inexact rounding to a denormal number (0x1.8p-10 -> 0x1.0p-9).
+  losesInfo = false;
+  test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.8p-10");
+  status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0x1.0p-9, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+}
+
+TEST(APFloatTest, ConvertE4M3FNUZToE5M2FNUZ) { // Float8E4M3FNUZ -> E5M2FNUZ.
+  bool losesInfo;
+  APFloat test(APFloat::Float8E4M3FNUZ(), "1.0");
+  APFloat::opStatus status = test.convert(
+      APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(1.0f, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  losesInfo = true; // Start from the opposite of the expected flag value.
+  test = APFloat(APFloat::Float8E4M3FNUZ(), "0.0");
+  status = test.convert(APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  losesInfo = false;
+  test = APFloat(APFloat::Float8E4M3FNUZ(), "0x1.2p0"); // 1.125, rounds down
+  status = test.convert(APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0x1.0p0 /* 1.0 */, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opInexact);
+
+  losesInfo = false;
+  test = APFloat(APFloat::Float8E4M3FNUZ(), "0x1.6p0"); // 1.375, rounds up
+  status = test.convert(APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0x1.8p0 /* 1.5 */, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opInexact);
+
+  // E4M3FNUZ denormal -> E5M2FNUZ normal. The value must not be truncated even
+  // though the destination format has one fewer significand bit.
+  losesInfo = true;
+  test = APFloat(APFloat::Float8E4M3FNUZ(), "0x1.Cp-8");
+  status = test.convert(APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0x1.Cp-8, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+}
+
 TEST(APFloatTest, F8ToString) {
   for (APFloat::Semantics S :
-       {APFloat::S_Float8E5M2, APFloat::S_Float8E4M3FN}) {
+       {APFloat::S_Float8E5M2, APFloat::S_Float8E4M3FN,
+        APFloat::S_Float8E5M2FNUZ, APFloat::S_Float8E4M3FNUZ}) {
     SCOPED_TRACE("Semantics=" + std::to_string(S));
     for (int i = 0; i < 256; i++) {
       SCOPED_TRACE("i=" + std::to_string(i));
-      APFloat test(APFloat::Float8E5M2(), APInt(8, i));
+      APFloat test(APFloat::EnumToSemantics(S), APInt(8, i));
       llvm::SmallString<128> str;
       test.toString(str);
 
       if (test.isNaN()) {
         EXPECT_EQ(str, "NaN");
       } else {
-        APFloat test2(APFloat::Float8E5M2(), str);
+        APFloat test2(APFloat::EnumToSemantics(S), str);
         EXPECT_TRUE(test.bitwiseIsEqual(test2));
       }
     }
@@ -5458,6 +6077,120 @@ TEST(APFloatTest, Float8E4M3FNToDouble) {
   EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
 }
 
+TEST(APFloatTest, Float8E5M2FNUZToDouble) { // convertToDouble() on key values.
+  APFloat One(APFloat::Float8E5M2FNUZ(), "1.0");
+  EXPECT_EQ(1.0, One.convertToDouble());
+  APFloat Two(APFloat::Float8E5M2FNUZ(), "2.0");
+  EXPECT_EQ(2.0, Two.convertToDouble());
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float8E5M2FNUZ(), false);
+  EXPECT_EQ(57344., PosLargest.convertToDouble()); // 57344 == 0x1.Cp15
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float8E5M2FNUZ(), true);
+  EXPECT_EQ(-57344., NegLargest.convertToDouble());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E5M2FNUZ(), false);
+  EXPECT_EQ(0x1.p-15, PosSmallest.convertToDouble());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E5M2FNUZ(), true);
+  EXPECT_EQ(-0x1.p-15, NegSmallest.convertToDouble());
+
+  APFloat SmallestDenorm =
+      APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), false);
+  EXPECT_TRUE(SmallestDenorm.isDenormal()); // Below the smallest normal above.
+  EXPECT_EQ(0x1p-17, SmallestDenorm.convertToDouble());
+
+  APFloat QNaN = APFloat::getQNaN(APFloat::Float8E5M2FNUZ());
+  EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); // NaN must stay NaN.
+}
+
+TEST(APFloatTest, Float8E4M3FNUZToDouble) { // convertToDouble() on key values.
+  APFloat One(APFloat::Float8E4M3FNUZ(), "1.0");
+  EXPECT_EQ(1.0, One.convertToDouble());
+  APFloat Two(APFloat::Float8E4M3FNUZ(), "2.0");
+  EXPECT_EQ(2.0, Two.convertToDouble());
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float8E4M3FNUZ(), false);
+  EXPECT_EQ(240., PosLargest.convertToDouble()); // 240 == 0x1.Ep7
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float8E4M3FNUZ(), true);
+  EXPECT_EQ(-240., NegLargest.convertToDouble());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ(), false);
+  EXPECT_EQ(0x1.p-7, PosSmallest.convertToDouble());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ(), true);
+  EXPECT_EQ(-0x1.p-7, NegSmallest.convertToDouble());
+
+  APFloat SmallestDenorm =
+      APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), false);
+  EXPECT_TRUE(SmallestDenorm.isDenormal()); // Below the smallest normal above.
+  EXPECT_EQ(0x1p-10, SmallestDenorm.convertToDouble());
+
+  APFloat QNaN = APFloat::getQNaN(APFloat::Float8E4M3FNUZ());
+  EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); // NaN must stay NaN.
+}
+
+TEST(APFloatTest, Float8E5M2FNUZToFloat) { // convertToFloat() on key values.
+  APFloat PosZero = APFloat::getZero(APFloat::Float8E5M2FNUZ());
+  APFloat PosZeroToFloat(PosZero.convertToFloat());
+  EXPECT_TRUE(PosZeroToFloat.isPosZero());
+  // Negative zero is not supported: asking for one must still give +0.0f.
+  APFloat NegZero = APFloat::getZero(APFloat::Float8E5M2FNUZ(), true);
+  APFloat NegZeroToFloat(NegZero.convertToFloat());
+  EXPECT_TRUE(NegZeroToFloat.isPosZero());
+  APFloat One(APFloat::Float8E5M2FNUZ(), "1.0");
+  EXPECT_EQ(1.0F, One.convertToFloat());
+  APFloat Two(APFloat::Float8E5M2FNUZ(), "2.0");
+  EXPECT_EQ(2.0F, Two.convertToFloat());
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float8E5M2FNUZ(), false);
+  EXPECT_EQ(57344.F, PosLargest.convertToFloat()); // 57344 == 0x1.Cp15
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float8E5M2FNUZ(), true);
+  EXPECT_EQ(-57344.F, NegLargest.convertToFloat());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E5M2FNUZ(), false);
+  EXPECT_EQ(0x1.p-15F, PosSmallest.convertToFloat());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E5M2FNUZ(), true);
+  EXPECT_EQ(-0x1.p-15F, NegSmallest.convertToFloat());
+
+  APFloat SmallestDenorm =
+      APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), false);
+  EXPECT_TRUE(SmallestDenorm.isDenormal()); // Below the smallest normal above.
+  EXPECT_EQ(0x1p-17F, SmallestDenorm.convertToFloat());
+
+  APFloat QNaN = APFloat::getQNaN(APFloat::Float8E5M2FNUZ());
+  EXPECT_TRUE(std::isnan(QNaN.convertToFloat())); // NaN must stay NaN.
+}
+
+TEST(APFloatTest, Float8E4M3FNUZToFloat) { // convertToFloat() on key values.
+  APFloat PosZero = APFloat::getZero(APFloat::Float8E4M3FNUZ());
+  APFloat PosZeroToFloat(PosZero.convertToFloat());
+  EXPECT_TRUE(PosZeroToFloat.isPosZero());
+  // Negative zero is not supported: asking for one must still give +0.0f.
+  APFloat NegZero = APFloat::getZero(APFloat::Float8E4M3FNUZ(), true);
+  APFloat NegZeroToFloat(NegZero.convertToFloat());
+  EXPECT_TRUE(NegZeroToFloat.isPosZero());
+  APFloat One(APFloat::Float8E4M3FNUZ(), "1.0");
+  EXPECT_EQ(1.0F, One.convertToFloat());
+  APFloat Two(APFloat::Float8E4M3FNUZ(), "2.0");
+  EXPECT_EQ(2.0F, Two.convertToFloat());
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float8E4M3FNUZ(), false);
+  EXPECT_EQ(240.F, PosLargest.convertToFloat()); // 240 == 0x1.Ep7
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float8E4M3FNUZ(), true);
+  EXPECT_EQ(-240.F, NegLargest.convertToFloat());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ(), false);
+  EXPECT_EQ(0x1.p-7F, PosSmallest.convertToFloat());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ(), true);
+  EXPECT_EQ(-0x1.p-7F, NegSmallest.convertToFloat());
+
+  APFloat SmallestDenorm =
+      APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), false);
+  EXPECT_TRUE(SmallestDenorm.isDenormal()); // Below the smallest normal above.
+  EXPECT_EQ(0x1p-10F, SmallestDenorm.convertToFloat());
+
+  APFloat QNaN = APFloat::getQNaN(APFloat::Float8E4M3FNUZ());
+  EXPECT_TRUE(std::isnan(QNaN.convertToFloat())); // NaN must stay NaN.
+}
+
 TEST(APFloatTest, IEEEsingleToFloat) {
   APFloat FPosZero(0.0F);
   APFloat FPosZeroToFloat(FPosZero.convertToFloat());


        


More information about the llvm-commits mailing list