[Mlir-commits] [mlir] [Arith][Transforms] Adds Truncf f32 to f4e2m1 (PR #144157)
Krzysztof Drewniak
llvmlistbot at llvm.org
Fri Jun 13 14:05:25 PDT 2025
================
@@ -366,6 +378,133 @@ struct F8E8M0ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
}
};
+/// Conversion from F32 to F4E2M1 according to the OCP Spec:
+/// www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+///
+/// The spec requires us to perform Round to Nearest, Ties to Even.
+///
+/// This means that ties are broken by choosing the candidate whose least
+/// significant mantissa bit is 0.
+///
+/// Table of representable values in F4E2M1:
+///
+/// Note: x is sign bit
+/// | Binary | Value ( + / - )
+/// | x000 | 0.0
+/// | x001 | 0.5
+/// | x010 | 1.0
+/// | x011 | 1.5
+/// | x100 | 2.0
+/// | x101 | 3.0
+/// | x110 | 4.0
+/// | x111 | 6.0
+///
+/// Conversion procedure:
+/// Step 1: Clamp to representable bounds.
+/// Step 2: Convert exponent by adjusting bias.
+/// Step 3: Set mantissa to first bit.
+/// Step 4: Special consideration for subnormal and zero exponent.
+/// Step 5: Round up if necessary, if mantissa[1:] greater than 1000000 or
+/// subnormal.
+struct F4E2M1TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
+ using OpRewritePattern::OpRewritePattern;
+ LogicalResult matchAndRewrite(arith::TruncFOp op,
+ PatternRewriter &rewriter) const final {
+ ImplicitLocOpBuilder b(op.getLoc(), rewriter);
+ Value operand = op.getOperand();
+ Type operandTy = operand.getType();
+ Type resultTy = op.getType();
+ Type resultETy = getElementTypeOrSelf(resultTy);
+
+ if (!llvm::isa<Float4E2M1FNType>(resultETy)) {
+ return rewriter.notifyMatchFailure(op, "not a trunc of F4E2M1FN");
+ }
+
+ Type i4Ty = cloneToShapedType(operandTy, b.getI4Type());
+ Type i8Ty = cloneToShapedType(operandTy, b.getI8Type());
+ Type i32Ty = cloneToShapedType(operandTy, b.getI32Type());
+ Type f32Ty = cloneToShapedType(operandTy, b.getF32Type());
+
+ // Constants
+ Value c0x1 = createConst(op->getLoc(), i4Ty, 1, rewriter);
+ Value cF32MantissaWidth =
+ createConst(op->getLoc(), i32Ty, 23, rewriter); // 23
+ Value cF4MantissaWidth = c0x1; // 1
+ Value cF32FirstBitMask =
+ createConst(op.getLoc(), i32Ty, 0x400000, rewriter);
+ Value c0x00000016 = createConst(op->getLoc(), i32Ty, 22, rewriter);
+ Value c0x00 = createConst(op.getLoc(), i8Ty, 0x00, rewriter);
+ Value c0xff = createConst(op.getLoc(), i8Ty, 0xff, rewriter);
+ Value cF32MantissaMask =
+ createConst(op->getLoc(), i32Ty, 0x7fffff, rewriter);
+ Value c0x00000000 = createConst(op.getLoc(), i32Ty, 0, rewriter);
+ Value cF32Last22BitMask =
+ createConst(op->getLoc(), i32Ty, 0x3fffff, rewriter);
+ ;
+
+ // Step 1: Clamp to bounds.
+ Value cHigherBound = createFloatConst(op->getLoc(), f32Ty, 6.0, rewriter);
+ Value cLowerBound = createFloatConst(op->getLoc(), f32Ty, -6.0, rewriter);
+ Value clampHigh = b.create<arith::CmpFOp>(arith::CmpFPredicate::UGT,
+ operand, cHigherBound);
+ Value clampLow = b.create<arith::CmpFOp>(arith::CmpFPredicate::ULT, operand,
+ cLowerBound);
+ Value operandClamped =
----------------
krzysz00 wrote:
These can just be min/max ops
https://github.com/llvm/llvm-project/pull/144157
More information about the Mlir-commits
mailing list