[clang] b1fe03f - [APFloat] Add APFloat support for FP6 data types (#94735)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Jun 11 00:46:56 PDT 2024
Author: Durgadoss R
Date: 2024-06-11T13:16:51+05:30
New Revision: b1fe03f0840a2c488b1f07a669bfea3cc986ce3b
URL: https://github.com/llvm/llvm-project/commit/b1fe03f0840a2c488b1f07a669bfea3cc986ce3b
DIFF: https://github.com/llvm/llvm-project/commit/b1fe03f0840a2c488b1f07a669bfea3cc986ce3b.diff
LOG: [APFloat] Add APFloat support for FP6 data types (#94735)
This patch adds APFloat type support for two FP6 data types,
E2M3 and E3M2. The definitions for the two formats are detailed
in section 5.3.2 of the OCP specification, which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
Signed-off-by: Durgadoss R <durgadossr at nvidia.com>
Added:
Modified:
clang/lib/AST/MicrosoftMangle.cpp
llvm/include/llvm/ADT/APFloat.h
llvm/lib/Support/APFloat.cpp
llvm/unittests/ADT/APFloatTest.cpp
Removed:
################################################################################
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 2f7a276363920..ffc5d2d4cd8fc 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) {
case APFloat::S_Float8E4M3FNUZ:
case APFloat::S_Float8E4M3B11FNUZ:
case APFloat::S_FloatTF32:
+ case APFloat::S_Float6E3M2FN:
+ case APFloat::S_Float6E2M3FN:
llvm_unreachable("Tried to mangle unexpected APFloat semantics");
}
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 78faadb30d9eb..a9bb6cc9999b1 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -189,6 +189,14 @@ struct APFloatBase {
// improved range compared to half (16-bit) formats, at (potentially)
// greater throughput than single precision (32-bit) formats.
S_FloatTF32,
+ // 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
+ // types, there are no infinity or NaN values. The format is detailed in
+ // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+ S_Float6E3M2FN,
+ // 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754
+ // types, there are no infinity or NaN values. The format is detailed in
+ // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+ S_Float6E2M3FN,
S_x87DoubleExtended,
S_MaxSemantics = S_x87DoubleExtended,
@@ -209,6 +217,8 @@ struct APFloatBase {
static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
static const fltSemantics &FloatTF32() LLVM_READNONE;
+ static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
+ static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
/// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase {
APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
APInt convertFloatTF32APFloatToAPInt() const;
+ APInt convertFloat6E3M2FNAPFloatToAPInt() const;
+ APInt convertFloat6E2M3FNAPFloatToAPInt() const;
void initFromAPInt(const fltSemantics *Sem, const APInt &api);
template <const fltSemantics &S> void initFromIEEEAPInt(const APInt &api);
void initFromHalfAPInt(const APInt &api);
@@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase {
void initFromFloat8E4M3FNUZAPInt(const APInt &api);
void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
void initFromFloatTF32APInt(const APInt &api);
+ void initFromFloat6E3M2FNAPInt(const APInt &api);
+ void initFromFloat6E2M3FNAPInt(const APInt &api);
void assign(const IEEEFloat &);
void copySignificand(const IEEEFloat &);
@@ -1046,6 +1060,17 @@ class APFloat : public APFloatBase {
/// \param Semantics - type float semantics
static APFloat getAllOnesValue(const fltSemantics &Semantics);
+ static bool hasNanOrInf(const fltSemantics &Sem) {
+ switch (SemanticsToEnum(Sem)) {
+ default:
+ return true;
+ // Below Semantics do not support {NaN or Inf}
+ case APFloat::S_Float6E3M2FN:
+ case APFloat::S_Float6E2M3FN:
+ return false;
+ }
+ }
+
/// Used to insert APFloat objects, or objects that contain APFloat objects,
/// into FoldingSets.
void Profile(FoldingSetNodeID &NID) const;
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 283fcc153b33a..1209bf71a287d 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
// `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
// encodings do not distinguish between signalling and quiet NaN.
NanOnly,
+
+ // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
+ // which do not support Inf or NaN values.
+ FiniteOnly,
};
// How NaN values are represented. This is currently only used in combination
@@ -139,6 +143,10 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = {
static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
+static constexpr fltSemantics semFloat6E3M2FN = {
+ 4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
+static constexpr fltSemantics semFloat6E2M3FN = {
+ 2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly};
static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
static constexpr fltSemantics semBogus = {0, 0, 0, 0};
@@ -206,6 +214,10 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) {
return Float8E4M3B11FNUZ();
case S_FloatTF32:
return FloatTF32();
+ case S_Float6E3M2FN:
+ return Float6E3M2FN();
+ case S_Float6E2M3FN:
+ return Float6E2M3FN();
case S_x87DoubleExtended:
return x87DoubleExtended();
}
@@ -238,6 +250,10 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
return S_Float8E4M3B11FNUZ;
else if (&Sem == &llvm::APFloat::FloatTF32())
return S_FloatTF32;
+ else if (&Sem == &llvm::APFloat::Float6E3M2FN())
+ return S_Float6E3M2FN;
+ else if (&Sem == &llvm::APFloat::Float6E2M3FN())
+ return S_Float6E2M3FN;
else if (&Sem == &llvm::APFloat::x87DoubleExtended())
return S_x87DoubleExtended;
else
@@ -260,6 +276,8 @@ const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() {
return semFloat8E4M3B11FNUZ;
}
const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; }
+const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; }
+const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; }
const fltSemantics &APFloatBase::x87DoubleExtended() {
return semX87DoubleExtended;
}
@@ -878,6 +896,9 @@ void IEEEFloat::copySignificand(const IEEEFloat &rhs) {
for the significand. If double or longer, this is a signalling NaN,
which may not be ideal. If float, this is QNaN(0). */
void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) {
+ if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
+ llvm_unreachable("This floating point format does not support NaN");
+
category = fcNaN;
sign = Negative;
exponent = exponentNaN();
@@ -1499,16 +1520,18 @@ static void tcSetLeastSignificantBits(APInt::WordType *dst, unsigned parts,
/* Handle overflow. Sign is preserved. We either become infinity or
the largest finite number. */
IEEEFloat::opStatus IEEEFloat::handleOverflow(roundingMode rounding_mode) {
- /* Infinity? */
- if (rounding_mode == rmNearestTiesToEven ||
- rounding_mode == rmNearestTiesToAway ||
- (rounding_mode == rmTowardPositive && !sign) ||
- (rounding_mode == rmTowardNegative && sign)) {
- if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
- makeNaN(false, sign);
- else
- category = fcInfinity;
- return (opStatus) (opOverflow | opInexact);
+ if (semantics->nonFiniteBehavior != fltNonfiniteBehavior::FiniteOnly) {
+ /* Infinity? */
+ if (rounding_mode == rmNearestTiesToEven ||
+ rounding_mode == rmNearestTiesToAway ||
+ (rounding_mode == rmTowardPositive && !sign) ||
+ (rounding_mode == rmTowardNegative && sign)) {
+ if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
+ makeNaN(false, sign);
+ else
+ category = fcInfinity;
+ return static_cast<opStatus>(opOverflow | opInexact);
+ }
}
/* Otherwise we become the largest finite number. */
@@ -3518,13 +3541,15 @@ APInt IEEEFloat::convertIEEEFloatToAPInt() const {
myexponent = ::exponentZero(S) + bias;
mysignificand.fill(0);
} else if (category == fcInfinity) {
- if (S.nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) {
+ if (S.nonFiniteBehavior == fltNonfiniteBehavior::NanOnly ||
+ S.nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
llvm_unreachable("semantics don't support inf!");
- }
myexponent = ::exponentInf(S) + bias;
mysignificand.fill(0);
} else {
assert(category == fcNaN && "Unknown category!");
+ if (S.nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
+ llvm_unreachable("semantics don't support NaN!");
myexponent = ::exponentNaN(S) + bias;
std::copy_n(significandParts(), mysignificand.size(),
mysignificand.begin());
@@ -3605,6 +3630,16 @@ APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const {
return convertIEEEFloatToAPInt<semFloatTF32>();
}
+APInt IEEEFloat::convertFloat6E3M2FNAPFloatToAPInt() const {
+ assert(partCount() == 1);
+ return convertIEEEFloatToAPInt<semFloat6E3M2FN>();
+}
+
+APInt IEEEFloat::convertFloat6E2M3FNAPFloatToAPInt() const {
+ assert(partCount() == 1);
+ return convertIEEEFloatToAPInt<semFloat6E2M3FN>();
+}
+
// This function creates an APInt that is just a bit map of the floating
// point constant as it would appear in memory. It is not a conversion,
// and treating the result as a normal integer is unlikely to be useful.
@@ -3646,6 +3681,12 @@ APInt IEEEFloat::bitcastToAPInt() const {
if (semantics == (const llvm::fltSemantics *)&semFloatTF32)
return convertFloatTF32APFloatToAPInt();
+ if (semantics == (const llvm::fltSemantics *)&semFloat6E3M2FN)
+ return convertFloat6E3M2FNAPFloatToAPInt();
+
+ if (semantics == (const llvm::fltSemantics *)&semFloat6E2M3FN)
+ return convertFloat6E2M3FNAPFloatToAPInt();
+
assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended &&
"unknown format!");
return convertF80LongDoubleAPFloatToAPInt();
@@ -3862,6 +3903,14 @@ void IEEEFloat::initFromFloatTF32APInt(const APInt &api) {
initFromIEEEAPInt<semFloatTF32>(api);
}
+void IEEEFloat::initFromFloat6E3M2FNAPInt(const APInt &api) {
+ initFromIEEEAPInt<semFloat6E3M2FN>(api);
+}
+
+void IEEEFloat::initFromFloat6E2M3FNAPInt(const APInt &api) {
+ initFromIEEEAPInt<semFloat6E2M3FN>(api);
+}
+
/// Treat api as containing the bits of a floating point number.
void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
assert(api.getBitWidth() == Sem->sizeInBits);
@@ -3891,6 +3940,10 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
return initFromFloat8E4M3B11FNUZAPInt(api);
if (Sem == &semFloatTF32)
return initFromFloatTF32APInt(api);
+ if (Sem == &semFloat6E3M2FN)
+ return initFromFloat6E3M2FNAPInt(api);
+ if (Sem == &semFloat6E2M3FN)
+ return initFromFloat6E2M3FNAPInt(api);
llvm_unreachable(nullptr);
}
@@ -4328,7 +4381,8 @@ int IEEEFloat::getExactLog2Abs() const {
bool IEEEFloat::isSignaling() const {
if (!isNaN())
return false;
- if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
+ if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly ||
+ semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
return false;
// IEEE-754R 2008 6.2.1: A signaling NaN bit string should be encoded with the
@@ -4387,6 +4441,10 @@ IEEEFloat::opStatus IEEEFloat::next(bool nextDown) {
// nextUp(getLargest()) == NAN
makeNaN();
break;
+ } else if (semantics->nonFiniteBehavior ==
+ fltNonfiniteBehavior::FiniteOnly) {
+ // nextUp(getLargest()) == getLargest()
+ break;
} else {
// nextUp(getLargest()) == INFINITY
APInt::tcSet(significandParts(), 0, partCount());
@@ -4477,6 +4535,9 @@ APFloatBase::ExponentType IEEEFloat::exponentZero() const {
}
void IEEEFloat::makeInf(bool Negative) {
+ if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
+ llvm_unreachable("This floating point format does not support Inf");
+
if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) {
// There is no Inf, so make NaN instead.
makeNaN(false, Negative);
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index 6e4dda8351a1b..7007d944801a7 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -723,11 +723,13 @@ TEST(APFloatTest, IsSmallestNormalized) {
EXPECT_FALSE(APFloat::getZero(Semantics, false).isSmallestNormalized());
EXPECT_FALSE(APFloat::getZero(Semantics, true).isSmallestNormalized());
- EXPECT_FALSE(APFloat::getInf(Semantics, false).isSmallestNormalized());
- EXPECT_FALSE(APFloat::getInf(Semantics, true).isSmallestNormalized());
+ if (APFloat::hasNanOrInf(Semantics)) {
+ EXPECT_FALSE(APFloat::getInf(Semantics, false).isSmallestNormalized());
+ EXPECT_FALSE(APFloat::getInf(Semantics, true).isSmallestNormalized());
- EXPECT_FALSE(APFloat::getQNaN(Semantics).isSmallestNormalized());
- EXPECT_FALSE(APFloat::getSNaN(Semantics).isSmallestNormalized());
+ EXPECT_FALSE(APFloat::getQNaN(Semantics).isSmallestNormalized());
+ EXPECT_FALSE(APFloat::getSNaN(Semantics).isSmallestNormalized());
+ }
EXPECT_FALSE(APFloat::getLargest(Semantics).isSmallestNormalized());
EXPECT_FALSE(APFloat::getLargest(Semantics, true).isSmallestNormalized());
@@ -1823,6 +1825,9 @@ TEST(APFloatTest, getLargest) {
30, APFloat::getLargest(APFloat::Float8E4M3B11FNUZ()).convertToDouble());
EXPECT_EQ(3.40116213421e+38f,
APFloat::getLargest(APFloat::FloatTF32()).convertToFloat());
+ EXPECT_EQ(28, APFloat::getLargest(APFloat::Float6E3M2FN()).convertToDouble());
+ EXPECT_EQ(7.5,
+ APFloat::getLargest(APFloat::Float6E2M3FN()).convertToDouble());
}
TEST(APFloatTest, getSmallest) {
@@ -1881,6 +1886,20 @@ TEST(APFloatTest, getSmallest) {
EXPECT_TRUE(test.isFiniteNonZero());
EXPECT_TRUE(test.isDenormal());
EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ test = APFloat::getSmallest(APFloat::Float6E3M2FN(), false);
+ expected = APFloat(APFloat::Float6E3M2FN(), "0x0.1p0");
+ EXPECT_FALSE(test.isNegative());
+ EXPECT_TRUE(test.isFiniteNonZero());
+ EXPECT_TRUE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ test = APFloat::getSmallest(APFloat::Float6E2M3FN(), false);
+ expected = APFloat(APFloat::Float6E2M3FN(), "0x0.2p0");
+ EXPECT_FALSE(test.isNegative());
+ EXPECT_TRUE(test.isFiniteNonZero());
+ EXPECT_TRUE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
}
TEST(APFloatTest, getSmallestNormalized) {
@@ -1963,6 +1982,21 @@ TEST(APFloatTest, getSmallestNormalized) {
EXPECT_FALSE(test.isDenormal());
EXPECT_TRUE(test.bitwiseIsEqual(expected));
EXPECT_TRUE(test.isSmallestNormalized());
+ test = APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false);
+ expected = APFloat(APFloat::Float6E3M2FN(), "0x1p-2");
+ EXPECT_FALSE(test.isNegative());
+ EXPECT_TRUE(test.isFiniteNonZero());
+ EXPECT_FALSE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+ EXPECT_TRUE(test.isSmallestNormalized());
+
+ test = APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false);
+ expected = APFloat(APFloat::Float6E2M3FN(), "0x1p0");
+ EXPECT_FALSE(test.isNegative());
+ EXPECT_TRUE(test.isFiniteNonZero());
+ EXPECT_FALSE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+ EXPECT_TRUE(test.isSmallestNormalized());
}
TEST(APFloatTest, getZero) {
@@ -1996,7 +2030,11 @@ TEST(APFloatTest, getZero) {
{&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1},
{&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1},
{&APFloat::FloatTF32(), false, true, {0, 0}, 1},
- {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1}};
+ {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1},
+ {&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1},
+ {&APFloat::Float6E3M2FN(), true, true, {0x20ULL, 0}, 1},
+ {&APFloat::Float6E2M3FN(), false, true, {0, 0}, 1},
+ {&APFloat::Float6E2M3FN(), true, true, {0x20ULL, 0}, 1}};
const unsigned NumGetZeroTests = std::size(GetZeroTest);
for (unsigned i = 0; i < NumGetZeroTests; ++i) {
APFloat test = APFloat::getZero(*GetZeroTest[i].semantics,
@@ -5161,6 +5199,90 @@ TEST(APFloatTest, Float8ExhaustivePair) {
}
}
+TEST(APFloatTest, Float6ExhaustivePair) {
+ // Exhaustively check every ordered pair (x, y) of 6-bit encodings for the
+ // two FP6 semantics. Each arithmetic operation is performed once directly
+ // in the FP6 format and once in IEEE half (with the half result converted
+ // back to FP6); the two results must be bitwise identical.
+ for (APFloat::Semantics Sem :
+ {APFloat::S_Float6E3M2FN, APFloat::S_Float6E2M3FN}) {
+ const llvm::fltSemantics &S = APFloat::EnumToSemantics(Sem);
+ // Start at 0 so the +0 encoding is exercised as an operand as well. These
+ // formats have no Inf/NaN encodings, so all 64 bit patterns are usable.
+ for (int i = 0; i < 64; i++) {
+ for (int j = 0; j < 64; j++) {
+ SCOPED_TRACE("sem=" + std::to_string(Sem) + ",i=" + std::to_string(i) +
+ ",j=" + std::to_string(j));
+ APFloat x(S, APInt(6, i));
+ APFloat y(S, APInt(6, j));
+
+ // Widening either operand to half is expected to be exact.
+ bool losesInfo;
+ APFloat x16 = x;
+ x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_FALSE(losesInfo);
+ APFloat y16 = y;
+ y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_FALSE(losesInfo);
+
+ // Add
+ APFloat z = x;
+ z.add(y, APFloat::rmNearestTiesToEven);
+ APFloat z16 = x16;
+ z16.add(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Subtract
+ z = x;
+ z.subtract(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.subtract(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Multiply
+ z = x;
+ z.multiply(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.multiply(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Skip divide, mod and remainder when the divisor is zero: bit pattern
+ // 0 is +0 and bit pattern 32 (0x20, sign bit only) is -0.
+ if (j == 0 || j == 32)
+ continue;
+
+ // Divide
+ z = x;
+ z.divide(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.divide(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Mod
+ z = x;
+ z.mod(y);
+ z16 = x16;
+ z16.mod(y16);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Remainder
+ z = x;
+ z.remainder(y);
+ z16 = x16;
+ z16.remainder(y16);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+ }
+ }
+ }
+}
+
+
TEST(APFloatTest, ConvertE4M3FNToE5M2) {
bool losesInfo;
APFloat test(APFloat::Float8E4M3FN(), "1.0");
@@ -6620,28 +6742,39 @@ TEST(APFloatTest, getExactLog2) {
EXPECT_EQ(INT_MIN, APFloat(Semantics, "-3.0").getExactLog2());
EXPECT_EQ(INT_MIN, APFloat(Semantics, "3.0").getExactLog2Abs());
EXPECT_EQ(INT_MIN, APFloat(Semantics, "-3.0").getExactLog2Abs());
- EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2());
- EXPECT_EQ(INT_MIN, APFloat(Semantics, "-8.0").getExactLog2());
- EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2());
- EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2Abs());
- EXPECT_EQ(INT_MIN, APFloat(Semantics, "-0.25").getExactLog2());
- EXPECT_EQ(-2, APFloat(Semantics, "-0.25").getExactLog2Abs());
- EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2Abs());
- EXPECT_EQ(3, APFloat(Semantics, "-8.0").getExactLog2Abs());
+
+ if (I == APFloat::S_Float6E2M3FN) {
+ EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2());
+ EXPECT_EQ(INT_MIN, APFloat(Semantics, "-4.0").getExactLog2());
+ EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2Abs());
+ EXPECT_EQ(2, APFloat(Semantics, "-4.0").getExactLog2Abs());
+ } else {
+ EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2());
+ EXPECT_EQ(INT_MIN, APFloat(Semantics, "-8.0").getExactLog2());
+ EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2());
+ EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2Abs());
+ EXPECT_EQ(INT_MIN, APFloat(Semantics, "-0.25").getExactLog2());
+ EXPECT_EQ(-2, APFloat(Semantics, "-0.25").getExactLog2Abs());
+ EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2Abs());
+ EXPECT_EQ(3, APFloat(Semantics, "-8.0").getExactLog2Abs());
+ }
EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, false).getExactLog2());
EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, true).getExactLog2());
- EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2());
- EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2());
- EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2());
- EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2());
-
EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, false).getExactLog2Abs());
EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, true).getExactLog2Abs());
- EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2Abs());
- EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2Abs());
- EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2Abs());
- EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2Abs());
+
+ if (APFloat::hasNanOrInf(Semantics)) {
+ EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2());
+ EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2());
+ EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2());
+ EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2());
+
+ EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2Abs());
+ EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2Abs());
+ EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2Abs());
+ EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2Abs());
+ }
EXPECT_EQ(INT_MIN,
scalbn(One, MinExp - Precision - 1, APFloat::rmNearestTiesToEven)
@@ -6660,4 +6793,311 @@ TEST(APFloatTest, getExactLog2) {
}
}
+TEST(APFloatTest, Float6E3M2FNFromString) {
+ // Exactly representable
+ EXPECT_EQ(28, APFloat(APFloat::Float6E3M2FN(), "28").convertToDouble());
+ // Round down to maximum value
+ EXPECT_EQ(28, APFloat(APFloat::Float6E3M2FN(), "32").convertToDouble());
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+ EXPECT_DEATH(APFloat(APFloat::Float6E3M2FN(), "inf"),
+ "This floating point format does not support Inf");
+ EXPECT_DEATH(APFloat(APFloat::Float6E3M2FN(), "nan"),
+ "This floating point format does not support NaN");
+#endif
+#endif
+
+ EXPECT_TRUE(APFloat(APFloat::Float6E3M2FN(), "0").isPosZero());
+ EXPECT_TRUE(APFloat(APFloat::Float6E3M2FN(), "-0").isNegZero());
+}
+
+TEST(APFloatTest, Float6E2M3FNFromString) {
+ // Exactly representable
+ EXPECT_EQ(7.5, APFloat(APFloat::Float6E2M3FN(), "7.5").convertToDouble());
+ // Round down to maximum value
+ EXPECT_EQ(7.5, APFloat(APFloat::Float6E2M3FN(), "32").convertToDouble());
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+ EXPECT_DEATH(APFloat(APFloat::Float6E2M3FN(), "inf"),
+ "This floating point format does not support Inf");
+ EXPECT_DEATH(APFloat(APFloat::Float6E2M3FN(), "nan"),
+ "This floating point format does not support NaN");
+#endif
+#endif
+
+ EXPECT_TRUE(APFloat(APFloat::Float6E2M3FN(), "0").isPosZero());
+ EXPECT_TRUE(APFloat(APFloat::Float6E2M3FN(), "-0").isNegZero());
+}
+
+TEST(APFloatTest, ConvertE3M2FToE2M3F) {
+ bool losesInfo;
+ APFloat test(APFloat::Float6E3M2FN(), "1.0");
+ APFloat::opStatus status = test.convert(
+ APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_EQ(1.0f, test.convertToFloat());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ test = APFloat(APFloat::Float6E3M2FN(), "0.0");
+ status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(0.0f, test.convertToFloat());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ // Test overflow
+ losesInfo = false;
+ test = APFloat(APFloat::Float6E3M2FN(), "28");
+ status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(7.5f, test.convertToFloat());
+ EXPECT_TRUE(losesInfo);
+ EXPECT_EQ(status, APFloat::opInexact);
+
+ // Test underflow
+ test = APFloat(APFloat::Float6E3M2FN(), ".0625");
+ status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(0., test.convertToFloat());
+ EXPECT_TRUE(losesInfo);
+ EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+
+ // Testing inexact rounding to denormal number
+ losesInfo = false;
+ test = APFloat(APFloat::Float6E3M2FN(), "0.1875");
+ status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(0.25, test.convertToFloat());
+ EXPECT_TRUE(losesInfo);
+ EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+}
+
+TEST(APFloatTest, ConvertE2M3FToE3M2F) {
+ bool losesInfo;
+ APFloat test(APFloat::Float6E2M3FN(), "1.0");
+ APFloat::opStatus status = test.convert(
+ APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_EQ(1.0f, test.convertToFloat());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ test = APFloat(APFloat::Float6E2M3FN(), "0.0");
+ status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(0.0f, test.convertToFloat());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ test = APFloat(APFloat::Float6E2M3FN(), ".125");
+ status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(.125, test.convertToFloat());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ // Test inexact rounding
+ losesInfo = false;
+ test = APFloat(APFloat::Float6E2M3FN(), "7.5");
+ status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(8, test.convertToFloat());
+ EXPECT_TRUE(losesInfo);
+ EXPECT_EQ(status, APFloat::opInexact);
+}
+
+TEST(APFloatTest, Float6E3M2FNNext) {
+ APFloat test(APFloat::Float6E3M2FN(), APFloat::uninitialized);
+ APFloat expected(APFloat::Float6E3M2FN(), APFloat::uninitialized);
+
+ // 1. NextUp of largest bit pattern is the same
+ test = APFloat::getLargest(APFloat::Float6E3M2FN());
+ expected = APFloat::getLargest(APFloat::Float6E3M2FN());
+ EXPECT_EQ(test.next(false), APFloat::opOK);
+ EXPECT_FALSE(test.isInfinity());
+ EXPECT_FALSE(test.isZero());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 2. NextUp of smallest negative denormal is -0
+ test = APFloat::getSmallest(APFloat::Float6E3M2FN(), true);
+ expected = APFloat::getZero(APFloat::Float6E3M2FN(), true);
+ EXPECT_EQ(test.next(false), APFloat::opOK);
+ EXPECT_TRUE(test.isNegZero());
+ EXPECT_FALSE(test.isPosZero());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 3. nextDown of negative of largest value is the same
+ test = APFloat::getLargest(APFloat::Float6E3M2FN(), true);
+ expected = test;
+ EXPECT_EQ(test.next(true), APFloat::opOK);
+ EXPECT_FALSE(test.isInfinity());
+ EXPECT_FALSE(test.isZero());
+ EXPECT_FALSE(test.isNaN());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 4. nextDown of +0 is smallest negative denormal
+ test = APFloat::getZero(APFloat::Float6E3M2FN(), false);
+ expected = APFloat::getSmallest(APFloat::Float6E3M2FN(), true);
+ EXPECT_EQ(test.next(true), APFloat::opOK);
+ EXPECT_FALSE(test.isZero());
+ EXPECT_TRUE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+}
+
+TEST(APFloatTest, Float6E2M3FNNext) {
+ APFloat test(APFloat::Float6E2M3FN(), APFloat::uninitialized);
+ APFloat expected(APFloat::Float6E2M3FN(), APFloat::uninitialized);
+
+ // 1. NextUp of largest bit pattern is the same
+ test = APFloat::getLargest(APFloat::Float6E2M3FN());
+ expected = APFloat::getLargest(APFloat::Float6E2M3FN());
+ EXPECT_EQ(test.next(false), APFloat::opOK);
+ EXPECT_FALSE(test.isInfinity());
+ EXPECT_FALSE(test.isZero());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 2. NextUp of smallest negative denormal is -0
+ test = APFloat::getSmallest(APFloat::Float6E2M3FN(), true);
+ expected = APFloat::getZero(APFloat::Float6E2M3FN(), true);
+ EXPECT_EQ(test.next(false), APFloat::opOK);
+ EXPECT_TRUE(test.isNegZero());
+ EXPECT_FALSE(test.isPosZero());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 3. nextDown of negative of largest value is the same
+ test = APFloat::getLargest(APFloat::Float6E2M3FN(), true);
+ expected = test;
+ EXPECT_EQ(test.next(true), APFloat::opOK);
+ EXPECT_FALSE(test.isInfinity());
+ EXPECT_FALSE(test.isZero());
+ EXPECT_FALSE(test.isNaN());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 4. nextDown of +0 is smallest negative denormal
+ test = APFloat::getZero(APFloat::Float6E2M3FN(), false);
+ expected = APFloat::getSmallest(APFloat::Float6E2M3FN(), true);
+ EXPECT_EQ(test.next(true), APFloat::opOK);
+ EXPECT_FALSE(test.isZero());
+ EXPECT_TRUE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+}
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+TEST(APFloatTest, Float6E3M2FNGetInfNaN) {
+ EXPECT_DEATH(APFloat::getInf(APFloat::Float6E3M2FN()),
+ "This floating point format does not support Inf");
+ EXPECT_DEATH(APFloat::getNaN(APFloat::Float6E3M2FN()),
+ "This floating point format does not support NaN");
+}
+
+TEST(APFloatTest, Float6E2M3FNGetInfNaN) {
+ EXPECT_DEATH(APFloat::getInf(APFloat::Float6E2M3FN()),
+ "This floating point format does not support Inf");
+ EXPECT_DEATH(APFloat::getNaN(APFloat::Float6E2M3FN()),
+ "This floating point format does not support NaN");
+}
+#endif
+#endif
+
+TEST(APFloatTest, Float6E3M2FNToDouble) {
+ APFloat One(APFloat::Float6E3M2FN(), "1.0");
+ EXPECT_EQ(1.0, One.convertToDouble());
+ APFloat Two(APFloat::Float6E3M2FN(), "2.0");
+ EXPECT_EQ(2.0, Two.convertToDouble());
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), false);
+ EXPECT_EQ(28., PosLargest.convertToDouble());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), true);
+ EXPECT_EQ(-28., NegLargest.convertToDouble());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false);
+ EXPECT_EQ(0x1p-2, PosSmallest.convertToDouble());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), true);
+ EXPECT_EQ(-0x1p-2, NegSmallest.convertToDouble());
+
+ APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E3M2FN(), false);
+ EXPECT_TRUE(SmallestDenorm.isDenormal());
+ EXPECT_EQ(0x0.1p0, SmallestDenorm.convertToDouble());
+}
+
+TEST(APFloatTest, Float6E2M3FNToDouble) {
+ APFloat One(APFloat::Float6E2M3FN(), "1.0");
+ EXPECT_EQ(1.0, One.convertToDouble());
+ APFloat Two(APFloat::Float6E2M3FN(), "2.0");
+ EXPECT_EQ(2.0, Two.convertToDouble());
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), false);
+ EXPECT_EQ(7.5, PosLargest.convertToDouble());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), true);
+ EXPECT_EQ(-7.5, NegLargest.convertToDouble());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false);
+ EXPECT_EQ(0x1p0, PosSmallest.convertToDouble());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), true);
+ EXPECT_EQ(-0x1p0, NegSmallest.convertToDouble());
+
+ APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E2M3FN(), false);
+ EXPECT_TRUE(SmallestDenorm.isDenormal());
+ EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToDouble());
+}
+
+TEST(APFloatTest, Float6E3M2FNToFloat) {
+ APFloat PosZero = APFloat::getZero(APFloat::Float6E3M2FN());
+ APFloat PosZeroToFloat(PosZero.convertToFloat());
+ EXPECT_TRUE(PosZeroToFloat.isPosZero());
+ APFloat NegZero = APFloat::getZero(APFloat::Float6E3M2FN(), true);
+ APFloat NegZeroToFloat(NegZero.convertToFloat());
+ EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+ APFloat One(APFloat::Float6E3M2FN(), "1.0");
+ EXPECT_EQ(1.0F, One.convertToFloat());
+ APFloat Two(APFloat::Float6E3M2FN(), "2.0");
+ EXPECT_EQ(2.0F, Two.convertToFloat());
+
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), false);
+ EXPECT_EQ(28., PosLargest.convertToFloat());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), true);
+ EXPECT_EQ(-28, NegLargest.convertToFloat());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false);
+ EXPECT_EQ(0x1p-2, PosSmallest.convertToFloat());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), true);
+ EXPECT_EQ(-0x1p-2, NegSmallest.convertToFloat());
+
+ APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E3M2FN(), false);
+ EXPECT_TRUE(SmallestDenorm.isDenormal());
+ EXPECT_EQ(0x0.1p0, SmallestDenorm.convertToFloat());
+}
+
+TEST(APFloatTest, Float6E2M3FNToFloat) {
+ APFloat PosZero = APFloat::getZero(APFloat::Float6E2M3FN());
+ APFloat PosZeroToFloat(PosZero.convertToFloat());
+ EXPECT_TRUE(PosZeroToFloat.isPosZero());
+ APFloat NegZero = APFloat::getZero(APFloat::Float6E2M3FN(), true);
+ APFloat NegZeroToFloat(NegZero.convertToFloat());
+ EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+ APFloat One(APFloat::Float6E2M3FN(), "1.0");
+ EXPECT_EQ(1.0F, One.convertToFloat());
+ APFloat Two(APFloat::Float6E2M3FN(), "2.0");
+ EXPECT_EQ(2.0F, Two.convertToFloat());
+
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), false);
+ EXPECT_EQ(7.5, PosLargest.convertToFloat());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), true);
+ EXPECT_EQ(-7.5, NegLargest.convertToFloat());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false);
+ EXPECT_EQ(0x1p0, PosSmallest.convertToFloat());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), true);
+ EXPECT_EQ(-0x1p0, NegSmallest.convertToFloat());
+
+ APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E2M3FN(), false);
+ EXPECT_TRUE(SmallestDenorm.isDenormal());
+ EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToFloat());
+}
} // namespace
More information about the cfe-commits
mailing list