[Mlir-commits] [mlir] 17e783b - [MLIR][NVVM] Add nvvm.addf and nvvm.subf Ops (#179162)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Mar 5 00:33:45 PST 2026
Author: Srinivasa Ravi
Date: 2026-03-05T14:03:41+05:30
New Revision: 17e783b241acb3070edde48ae52cc5b4c1b2b03c
URL: https://github.com/llvm/llvm-project/commit/17e783b241acb3070edde48ae52cc5b4c1b2b03c
DIFF: https://github.com/llvm/llvm-project/commit/17e783b241acb3070edde48ae52cc5b4c1b2b03c.diff
LOG: [MLIR][NVVM] Add nvvm.addf and nvvm.subf Ops (#179162)
Adds `nvvm.addf` and `nvvm.subf` Ops to the NVVM dialect. `nvvm.addf`
performs a floating-point addition between two operands. `nvvm.subf`
performs a floating-point subtraction between two operands and is
equivalent to an `llvm.fneg` followed by an `nvvm.addf` operation.
PTX ISA Reference:
1.
https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-add
2.
https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add
Added:
mlir/test/Dialect/LLVMIR/nvvm-canonicalize.mlir
mlir/test/Target/LLVMIR/nvvm/addf/addf.mlir
mlir/test/Target/LLVMIR/nvvm/addf/addf_invalid.mlir
mlir/test/Target/LLVMIR/nvvm/addf/addf_vector.mlir
Modified:
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 0629879b2e86a..43c7b3df73efe 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1873,14 +1873,35 @@ def FPRoundingModeAttr : EnumAttr<NVVM_Dialect, FPRoundingMode, "fp_rnd_mode"> {
def SaturationModeNone : I32EnumAttrCase<"NONE", 0, "none">;
def SaturationModeFinite : I32EnumAttrCase<"SATFINITE", 1, "satfinite">;
+def SaturationModeSat : I32EnumAttrCase<"SAT", 2, "sat">;
def SaturationMode : I32EnumAttr<"SaturationMode", "NVVM SaturationMode kind",
- [SaturationModeNone, SaturationModeFinite]> {
+ [SaturationModeNone, SaturationModeFinite, SaturationModeSat]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::NVVM";
}
def SaturationModeAttr : EnumAttr<NVVM_Dialect, SaturationMode, "sat_mode"> {
- let assemblyFormat = "`<` $value `>`";
+ let summary = "Describes the saturation mode";
+ let description = [{
+ A `nvvm.sat_mode` attribute specifies the saturation mode for instructions
+ involving floating points or integers. It can be one of the following
+ values:
+ - `none`: No saturation is applied.
+ - `satfinite`: If the absolute value of input (ignoring sign) is greater
+ than the `MAX_NORM` of the specified destination format, then the result
+ is the sign-preserved `MAX_NORM` of the destination format and a positive
+ `MAX_NORM` in unsigned datatypes for which the destination sign is not
+ supported. If the input is `NaN`, then the result can be `NaN` or th
+ `MAX_NORM` of the destination format, depending on the format.
+ - `sat`: For integer destination types, this limits the value to `MININT..
+ MAXINT` and applies to both signed and unsigned integer datatypes. For
+ floating point destination types (applies to only `F16`, `F32`, and `F64`
+ types), this limits the value to the range `[0.0, 1.0]` and flushes NaN
+ results to positive zero.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt)
+}];
+ let assemblyFormat = "`<` $value `>`";
}
def NVVM_ConvertFloatToTF32Op : NVVM_Op<"convert.float.to.tf32"> {
@@ -6280,6 +6301,73 @@ def NVVM_Tcgen05MMAWsSparseOp : NVVM_Op<"tcgen05.mma.ws.sp",
}];
}
+def SIMTFloatType : AnyTypeOf<[F16, BF16, F32, F64,
+ VectorOfLengthAndType<[2], [F16, BF16, F32, F64]>]>;
+
+def SaturationModeSatOrNone :
+ ConfinedAttr<SaturationModeAttr, [EnumAttrIsOneOf<SaturationModeAttr,
+ [SaturationModeNone, SaturationModeSat]>]>;
+
+def FPArithRoundingMode :
+ ConfinedAttr<FPRoundingModeAttr, [EnumAttrIsOneOf<FPRoundingModeAttr,
+ [FPRoundingModeNone, FPRoundingModeRM, FPRoundingModeRN,
+ FPRoundingModeRP, FPRoundingModeRZ]>]>;
+
+class NVVM_FloatBinaryOp<string mnemonic, list<Trait> traits = []> :
+ NVVM_Op<mnemonic, traits # [Pure, SameOperandsAndResultType]>,
+ Arguments<(ins SIMTFloatType:$lhs, SIMTFloatType:$rhs,
+ DefaultValuedAttr<FPArithRoundingMode, "FPRoundingMode::NONE">:$rnd,
+ DefaultValuedAttr<SaturationModeSatOrNone, "SaturationMode::NONE">:$sat,
+ DefaultValuedAttr<BoolAttr, "false">:$ftz)>,
+ Results<(outs SIMTFloatType:$res)> {
+ let assemblyFormat = "$lhs `,` $rhs attr-dict `:` type($res)";
+}
+
+def NVVM_AddFOp : NVVM_FloatBinaryOp<"addf", [Commutative]> {
+ let summary = [{
+ Performs floating point addition of the given arguments `lhs` and `rhs`
+ }];
+ let description = [{
+ The `nvvm.addf` operation performs floating point addition of two floating
+ point operands of the same type.
+
+ The rounding mode is specified by the `rnd` attribute, saturation mode by
+ the `sat` attribute, and flush-to-zero by the `ftz` attribute.
+
+ For more information, see PTX ISA:
+ - [floating point addition](https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-add)
+ - [half-precision floating point addition](https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add)
+ }];
+ let hasVerifier = 1;
+
+ let extraClassDeclaration = [{
+ static void lowerAddFToLLVMIR(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder);
+ }];
+ let llvmBuilder = [{
+ NVVM::AddFOp::lowerAddFToLLVMIR(*op, moduleTranslation, builder);
+ }];
+}
+
+def NVVM_SubFOp : NVVM_FloatBinaryOp<"subf"> {
+ let summary = [{
+ Performs floating point subtraction of the given arguments `lhs` and `rhs`
+ }];
+ let description = [{
+ The `nvvm.subf` operation performs floating point subtraction of two
+ operands.
+
+ It supports the same type combinations and modifiers as `nvvm.addf`.
+ This is equivalent to `nvvm.addf(lhs, -rhs)`.
+
+ For more information, see PTX ISA:
+ - [floating point subtraction](https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sub)
+ - [half-precision floating point subtraction](https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-sub)
+
+ }];
+ let hasCanonicalizer = 1;
+}
+
//===----------------------------------------------------------------------===//
// NVVM tensormap.replace Op
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index f0d22d896d88a..354a02f4a2aae 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -3059,6 +3059,46 @@ LogicalResult NVVM::TensormapReplaceOp::verify() {
return success();
}
+LogicalResult NVVM::AddFOp::verify() {
+ mlir::NVVM::FPRoundingMode rndMode = getRnd();
+ mlir::NVVM::SaturationMode satMode = getSat();
+ bool isFTZ = getFtz();
+
+ mlir::Type opType = getRes().getType();
+ mlir::Type opBaseType = isa<VectorType>(opType)
+ ? cast<VectorType>(opType).getElementType()
+ : opType;
+
+ if (opBaseType.isF64() && (satMode != NVVM::SaturationMode::NONE || isFTZ))
+ return emitOpError("FTZ and saturation are not supported for additions "
+ "involving f64 type");
+
+ if (opBaseType.isF16() && !(rndMode == NVVM::FPRoundingMode::RN ||
+ rndMode == NVVM::FPRoundingMode::NONE))
+ return emitOpError("only RN rounding mode is supported for f16 and "
+ "vector<2xf16> additions");
+
+ if (opBaseType.isBF16()) {
+ if (rndMode != NVVM::FPRoundingMode::RN &&
+ rndMode != NVVM::FPRoundingMode::NONE)
+ return emitOpError("only RN rounding mode is supported for bf16 and "
+ "vector<2xbf16> additions");
+ if (satMode != NVVM::SaturationMode::NONE || isFTZ)
+ return emitOpError("FTZ and saturation are not supported for bf16 and "
+ "vector<2xbf16> additions");
+ }
+
+ // FIXME: This is a temporary check disallowing lowering to add.rn.ftz.f16(x2)
+ // PTX instructions since the corresponding LLVM intrinsic is missing. This
+ // should be removed once the intrinsics for f16 addition (with FTZ only) are
+ // available.
+ if (opBaseType.isF16() && isFTZ && satMode == NVVM::SaturationMode::NONE)
+ return emitOpError("FTZ with no saturation is not supported for f16 and "
+ "vector<2xf16> additions");
+
+ return success();
+}
+
/// Packs the given `field` into the `result`.
/// The `result` is 64-bits and each `field` can be 32-bits or narrower.
static llvm::Value *
@@ -3135,6 +3175,30 @@ std::string NVVM::MBarrierTryWaitParityOp::getPtx() {
space);
}
+//===----------------------------------------------------------------------===//
+// Canonicalization patterns
+//===----------------------------------------------------------------------===//
+
+struct ConvertFsubToFnegFadd : public OpRewritePattern<SubFOp> {
+ using OpRewritePattern<SubFOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(SubFOp op,
+ PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+ Value negRhs =
+ LLVM::FNegOp::create(rewriter, loc, op.getRhs().getType(), op.getRhs());
+
+ rewriter.replaceOpWithNewOp<AddFOp>(op, op.getType(), op.getLhs(), negRhs,
+ op.getRnd(), op.getSat(), op.getFtz());
+ return success();
+ }
+};
+
+void SubFOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
+ MLIRContext *context) {
+ patterns.add<ConvertFsubToFnegFadd>(context);
+}
+
//===----------------------------------------------------------------------===//
// getIntrinsicID/getIntrinsicIDAndArgs methods
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index dd7a6e76f7569..02067bb456b25 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -446,6 +446,123 @@ getFenceProxySyncRestrictID(NVVM::MemOrderKind order) {
nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster;
}
+void NVVM::AddFOp::lowerAddFToLLVMIR(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::AddFOp>(op);
+ NVVM::FPRoundingMode rndMode = thisOp.getRnd();
+ NVVM::SaturationMode satMode = thisOp.getSat();
+ bool isFTZ = thisOp.getFtz();
+ bool isSat = satMode != NVVM::SaturationMode::NONE;
+
+ llvm::Value *argLHS = mt.lookupValue(thisOp.getLhs());
+ llvm::Value *argRHS = mt.lookupValue(thisOp.getRhs());
+
+ mlir::Type opType = thisOp.getLhs().getType();
+ llvm::Type *opTypeLLVM = mt.convertType(opType);
+ bool isVectorAdd = opTypeLLVM->isVectorTy();
+
+ // FIXME: Add intrinsics for add.rn.ftz.f16x2 and add.rn.ftz.f16 here when
+ // they are available.
+ static constexpr llvm::Intrinsic::ID f16IDs[] = {
+ llvm::Intrinsic::nvvm_add_rn_sat_f16,
+ llvm::Intrinsic::nvvm_add_rn_ftz_sat_f16,
+ llvm::Intrinsic::nvvm_add_rn_sat_v2f16,
+ llvm::Intrinsic::nvvm_add_rn_ftz_sat_v2f16,
+ };
+
+ static constexpr llvm::Intrinsic::ID f32IDs[] = {
+ llvm::Intrinsic::nvvm_add_rn_f, // default rounding mode RN
+ llvm::Intrinsic::nvvm_add_rn_f,
+ llvm::Intrinsic::nvvm_add_rm_f,
+ llvm::Intrinsic::nvvm_add_rp_f,
+ llvm::Intrinsic::nvvm_add_rz_f,
+ llvm::Intrinsic::nvvm_add_rn_sat_f, // default rounding mode RN
+ llvm::Intrinsic::nvvm_add_rn_sat_f,
+ llvm::Intrinsic::nvvm_add_rm_sat_f,
+ llvm::Intrinsic::nvvm_add_rp_sat_f,
+ llvm::Intrinsic::nvvm_add_rz_sat_f,
+ llvm::Intrinsic::nvvm_add_rn_ftz_f, // default rounding mode RN
+ llvm::Intrinsic::nvvm_add_rn_ftz_f,
+ llvm::Intrinsic::nvvm_add_rm_ftz_f,
+ llvm::Intrinsic::nvvm_add_rp_ftz_f,
+ llvm::Intrinsic::nvvm_add_rz_ftz_f,
+ llvm::Intrinsic::nvvm_add_rn_ftz_sat_f, // default rounding mode RN
+ llvm::Intrinsic::nvvm_add_rn_ftz_sat_f,
+ llvm::Intrinsic::nvvm_add_rm_ftz_sat_f,
+ llvm::Intrinsic::nvvm_add_rp_ftz_sat_f,
+ llvm::Intrinsic::nvvm_add_rz_ftz_sat_f,
+ };
+
+ static constexpr llvm::Intrinsic::ID f64IDs[] = {
+ llvm::Intrinsic::nvvm_add_rn_d, // default rounding mode RN
+ llvm::Intrinsic::nvvm_add_rn_d, llvm::Intrinsic::nvvm_add_rm_d,
+ llvm::Intrinsic::nvvm_add_rp_d, llvm::Intrinsic::nvvm_add_rz_d};
+
+ auto addIntrinsic = [&](llvm::Intrinsic::ID IID) -> llvm::Value * {
+ auto createAddIntrinsicCall = [&](llvm::Intrinsic::ID IID, llvm::Value *LHS,
+ llvm::Value *RHS) -> llvm::CallInst * {
+ llvm::SmallVector<llvm::Value *, 2> callArgs;
+ callArgs.push_back(LHS);
+ callArgs.push_back(RHS);
+ return createIntrinsicCall(builder, IID, callArgs);
+ };
+
+ if (isVectorAdd && (opTypeLLVM->getScalarType()->isFloatTy() ||
+ opTypeLLVM->getScalarType()->isDoubleTy())) {
+ llvm::Value *result = llvm::PoisonValue::get(
+ llvm::FixedVectorType::get(opTypeLLVM->getScalarType(), 2));
+ for (int64_t i = 0; i < 2; ++i) {
+ llvm::Value *lhsElemi =
+ builder.CreateExtractElement(argLHS, builder.getInt32(i));
+ llvm::Value *rhsElemi =
+ builder.CreateExtractElement(argRHS, builder.getInt32(i));
+ llvm::Value *sum = createAddIntrinsicCall(IID, lhsElemi, rhsElemi);
+ result = builder.CreateInsertElement(result, sum, builder.getInt32(i));
+ };
+ return result;
+ }
+
+ return createAddIntrinsicCall(IID, argLHS, argRHS);
+ }; // addIntrinsic end
+
+ // f16 + f16 -> f16 / vector<2xf16> + vector<2xf16> -> vector<2xf16>
+ // FIXME: Allow lowering to add.rn.ftz.f16x2 and add.rn.ftz.f16 here when the
+ // intrinsics are available.
+ if (opTypeLLVM->getScalarType()->isHalfTy()) {
+ llvm::Value *result;
+ if (isSat) {
+ unsigned index = (isVectorAdd << 1) | isFTZ;
+ result = addIntrinsic(f16IDs[index]);
+ } else {
+ result = builder.CreateFAdd(argLHS, argRHS);
+ }
+ mt.mapValue(thisOp.getRes(), result);
+ return;
+ }
+
+ // bf16 + bf16 -> bf16 / vector<2xbf16> + vector<2xbf16> -> vector<2xbf16>
+ if (opTypeLLVM->getScalarType()->isBFloatTy()) {
+ mt.mapValue(thisOp.getRes(), builder.CreateFAdd(argLHS, argRHS));
+ return;
+ }
+
+ // f64 + f64 -> f64 / vector<2xf64> + vector<2xf64> -> vector<2xf64>
+ if (opTypeLLVM->getScalarType()->isDoubleTy()) {
+ unsigned index = static_cast<unsigned>(rndMode);
+ mt.mapValue(thisOp.getRes(), addIntrinsic(f64IDs[index]));
+ return;
+ }
+
+ // f32 + f32 -> f32 / vector<2xf32> + vector<2xf32> -> vector<2xf32>
+ const unsigned numRndModes = 5; // NONE, RM, RN, RP, RZ
+ if (opTypeLLVM->getScalarType()->isFloatTy()) {
+ unsigned index =
+ ((isFTZ << 1) | isSat) * numRndModes + static_cast<unsigned>(rndMode);
+ mt.mapValue(thisOp.getRes(), addIntrinsic(f32IDs[index]));
+ return;
+ }
+}
+
namespace {
/// Implementation of the dialect interface that converts operations belonging
/// to the NVVM dialect to LLVM IR.
diff --git a/mlir/test/Dialect/LLVMIR/nvvm-canonicalize.mlir b/mlir/test/Dialect/LLVMIR/nvvm-canonicalize.mlir
new file mode 100644
index 0000000000000..fe9afd840bab2
--- /dev/null
+++ b/mlir/test/Dialect/LLVMIR/nvvm-canonicalize.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s -split-input-file --canonicalize | FileCheck %s
+
+// CHECK-LABEL: @subf_canonicalize
+llvm.func @subf_canonicalize(%arg0 : f32, %arg1 : f32) -> f32 {
+ // CHECK: %[[NEG_ARG1:.*]] = llvm.fneg %arg1 : f32
+ // CHECK: %[[ADD_RESULT:.*]] = nvvm.addf %arg0, %[[NEG_ARG1]] : f32
+ %0 = nvvm.subf %arg0, %arg1 : f32
+ llvm.return %0 : f32
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/addf/addf.mlir b/mlir/test/Target/LLVMIR/nvvm/addf/addf.mlir
new file mode 100644
index 0000000000000..fd05c85ae441f
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/addf/addf.mlir
@@ -0,0 +1,89 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// f16 + f16 -> f16
+llvm.func @fadd_f16_f16(%a : f16, %b : f16) -> f16 {
+ // CHECK-LABEL: define half @fadd_f16_f16(half %0, half %1) {
+ // CHECK-NEXT: %3 = fadd half %0, %1
+ // CHECK-NEXT: %4 = fadd half %3, %3
+ // CHECK-NEXT: %5 = call half @llvm.nvvm.add.rn.sat.f16(half %4, half %4)
+ // CHECK-NEXT: %6 = call half @llvm.nvvm.add.rn.ftz.sat.f16(half %5, half %5)
+ // CHECK-NEXT: ret half %6
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b : f16
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : f16
+ %f3 = nvvm.addf %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>} : f16
+ %f4 = nvvm.addf %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>, ftz=true} : f16
+ llvm.return %f4 : f16
+}
+
+// bf16 + bf16 -> bf16
+llvm.func @fadd_bf16_bf16(%a : bf16, %b : bf16) -> bf16 {
+ // CHECK-LABEL: define bfloat @fadd_bf16_bf16(bfloat %0, bfloat %1) {
+ // CHECK-NEXT: %3 = fadd bfloat %0, %1
+ // CHECK-NEXT: %4 = fadd bfloat %3, %3
+ // CHECK-NEXT: ret bfloat %4
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b : bf16
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : bf16
+ llvm.return %f2 : bf16
+}
+
+// f32 + f32 -> f32
+llvm.func @fadd_f32_f32(%a : f32, %b : f32) -> f32 {
+ // CHECK-LABEL: define float @fadd_f32_f32(float %0, float %1) {
+ // CHECK-NEXT: %3 = call float @llvm.nvvm.add.rn.f(float %0, float %1)
+ // CHECK-NEXT: %4 = call float @llvm.nvvm.add.rn.f(float %3, float %3)
+ // CHECK-NEXT: %5 = call float @llvm.nvvm.add.rn.sat.f(float %4, float %4)
+ // CHECK-NEXT: %6 = call float @llvm.nvvm.add.rn.ftz.f(float %5, float %5)
+ // CHECK-NEXT: %7 = call float @llvm.nvvm.add.rn.ftz.sat.f(float %6, float %6)
+ // CHECK-NEXT: %8 = call float @llvm.nvvm.add.rm.f(float %7, float %7)
+ // CHECK-NEXT: %9 = call float @llvm.nvvm.add.rm.sat.f(float %8, float %8)
+ // CHECK-NEXT: %10 = call float @llvm.nvvm.add.rm.ftz.f(float %9, float %9)
+ // CHECK-NEXT: %11 = call float @llvm.nvvm.add.rm.ftz.sat.f(float %10, float %10)
+ // CHECK-NEXT: %12 = call float @llvm.nvvm.add.rp.f(float %11, float %11)
+ // CHECK-NEXT: %13 = call float @llvm.nvvm.add.rp.sat.f(float %12, float %12)
+ // CHECK-NEXT: %14 = call float @llvm.nvvm.add.rp.ftz.f(float %13, float %13)
+ // CHECK-NEXT: %15 = call float @llvm.nvvm.add.rp.ftz.sat.f(float %14, float %14)
+ // CHECK-NEXT: %16 = call float @llvm.nvvm.add.rz.f(float %15, float %15)
+ // CHECK-NEXT: %17 = call float @llvm.nvvm.add.rz.sat.f(float %16, float %16)
+ // CHECK-NEXT: %18 = call float @llvm.nvvm.add.rz.ftz.f(float %17, float %17)
+ // CHECK-NEXT: %19 = call float @llvm.nvvm.add.rz.ftz.sat.f(float %18, float %18)
+ // CHECK-NEXT: ret float %19
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b : f32
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : f32
+ %f3 = nvvm.addf %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>} : f32
+ %f4 = nvvm.addf %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rn>, ftz=true} : f32
+ %f5 = nvvm.addf %f4, %f4 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>, ftz=true} : f32
+ %f6 = nvvm.addf %f5, %f5 {rnd = #nvvm.fp_rnd_mode<rm>} : f32
+ %f7 = nvvm.addf %f6, %f6 {rnd = #nvvm.fp_rnd_mode<rm>, sat = #nvvm.sat_mode<sat>} : f32
+ %f8 = nvvm.addf %f7, %f7 {rnd = #nvvm.fp_rnd_mode<rm>, ftz=true} : f32
+ %f9 = nvvm.addf %f8, %f8 {rnd = #nvvm.fp_rnd_mode<rm>, sat = #nvvm.sat_mode<sat>, ftz=true} : f32
+ %f10 = nvvm.addf %f9, %f9 {rnd = #nvvm.fp_rnd_mode<rp>} : f32
+ %f11 = nvvm.addf %f10, %f10 {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<sat>} : f32
+ %f12 = nvvm.addf %f11, %f11 {rnd = #nvvm.fp_rnd_mode<rp>, ftz=true} : f32
+ %f13 = nvvm.addf %f12, %f12 {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<sat>, ftz=true} : f32
+ %f14 = nvvm.addf %f13, %f13 {rnd = #nvvm.fp_rnd_mode<rz>} : f32
+ %f15 = nvvm.addf %f14, %f14 {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<sat>} : f32
+ %f16 = nvvm.addf %f15, %f15 {rnd = #nvvm.fp_rnd_mode<rz>, ftz=true} : f32
+ %f17 = nvvm.addf %f16, %f16 {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<sat>, ftz=true} : f32
+ llvm.return %f17 : f32
+}
+
+// f64 + f64 -> f64
+llvm.func @fadd_f64_f64(%a : f64, %b : f64) -> f64 {
+ // CHECK-LABEL: define double @fadd_f64_f64(double %0, double %1) {
+ // CHECK-NEXT: %3 = call double @llvm.nvvm.add.rn.d(double %0, double %1)
+ // CHECK-NEXT: %4 = call double @llvm.nvvm.add.rn.d(double %3, double %3)
+ // CHECK-NEXT: %5 = call double @llvm.nvvm.add.rm.d(double %4, double %4)
+ // CHECK-NEXT: %6 = call double @llvm.nvvm.add.rp.d(double %5, double %5)
+ // CHECK-NEXT: %7 = call double @llvm.nvvm.add.rz.d(double %6, double %6)
+ // CHECK-NEXT: ret double %7
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b : f64
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : f64
+ %f3 = nvvm.addf %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rm>} : f64
+ %f4 = nvvm.addf %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rp>} : f64
+ %f5 = nvvm.addf %f4, %f4 {rnd = #nvvm.fp_rnd_mode<rz>} : f64
+ llvm.return %f5 : f64
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/addf/addf_invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/addf/addf_invalid.mlir
new file mode 100644
index 0000000000000..b0b162357fe2f
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/addf/addf_invalid.mlir
@@ -0,0 +1,67 @@
+// RUN: mlir-translate --mlir-to-llvmir --split-input-file --verify-diagnostics %s
+
+// -----
+
+llvm.func @addf_invalid_sat_mode(%a : f16, %b : f16) -> f16 {
+ // expected-error at +1 {{ attribute 'sat' failed to satisfy constraint: Describes the saturation mode whose value is one of {none, sat}}}
+ %f1 = nvvm.addf %a, %b {sat = #nvvm.sat_mode<satfinite>} : f16
+ llvm.return %f1 : f16
+}
+
+// -----
+
+llvm.func @addf_invalid_f64_sat_ftz(%a : f64, %b : f64) -> f64 {
+ // expected-error at +1 {{FTZ and saturation are not supported for additions involving f64 type}}
+ %f1 = nvvm.addf %a, %b {sat = #nvvm.sat_mode<sat>, ftz=true} : f64
+ llvm.return %f1 : f64
+}
+
+// -----
+
+llvm.func @addf_invalid_f16_rnd_mode(%a : f16, %b : f16) -> f16 {
+ // expected-error at +1 {{only RN rounding mode is supported for f16 and vector<2xf16> additions}}
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rm>} : f16
+ llvm.return %f1 : f16
+}
+
+// -----
+
+llvm.func @addf_invalid_v2f16_rnd_mode(%a : vector<2xf16>, %b : vector<2xf16>) -> vector<2xf16> {
+ // expected-error at +1 {{only RN rounding mode is supported for f16 and vector<2xf16> additions}}
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rm>} : vector<2xf16>
+ llvm.return %f1 : vector<2xf16>
+}
+
+// -----
+
+llvm.func @addf_invalid_bf16_rnd_mode(%a : bf16, %b : bf16) -> bf16 {
+ // expected-error at +1 {{only RN rounding mode is supported for bf16 and vector<2xbf16> additions}}
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rm>} : bf16
+ llvm.return %f1 : bf16
+}
+
+// -----
+
+llvm.func @addf_invalid_v2bf16_rnd_mode(%a : vector<2xbf16>, %b : vector<2xbf16>) -> vector<2xbf16> {
+ // expected-error at +1 {{only RN rounding mode is supported for bf16 and vector<2xbf16> additions}}
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rm>} : vector<2xbf16>
+ llvm.return %f1 : vector<2xbf16>
+}
+
+// -----
+
+llvm.func @addf_invalid_bf16_sat_ftz(%a : bf16, %b : bf16) -> bf16 {
+ // expected-error at +1 {{FTZ and saturation are not supported for bf16 and vector<2xbf16> additions}}
+ %f1 = nvvm.addf %a, %b {sat = #nvvm.sat_mode<sat>, ftz=true} : bf16
+ llvm.return %f1 : bf16
+}
+
+// -----
+
+// FIXME: Remove this test once intrinsics for f16 addition (with FTZ only) are
+// available.
+llvm.func @addf_invalid_f16_ftz_no_sat(%a : f16, %b : f16) -> f16 {
+ // expected-error at +1 {{FTZ with no saturation is not supported for f16 and vector<2xf16> additions}}
+ %f1 = nvvm.addf %a, %b {ftz=true} : f16
+ llvm.return %f1 : f16
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/addf/addf_vector.mlir b/mlir/test/Target/LLVMIR/nvvm/addf/addf_vector.mlir
new file mode 100644
index 0000000000000..b472de739c92a
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/addf/addf_vector.mlir
@@ -0,0 +1,285 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// vector<2xf16> + vector<2xf16> -> vector<2xf16>
+llvm.func @addf_vector_f16_f16(%a : vector<2xf16>, %b : vector<2xf16>) -> vector<2xf16> {
+ // CHECK-LABEL: define <2 x half> @addf_vector_f16_f16(<2 x half> %0, <2 x half> %1) {
+ // CHECK-NEXT: %3 = fadd <2 x half> %0, %1
+ // CHECK-NEXT: %4 = fadd <2 x half> %3, %3
+ // CHECK-NEXT: %5 = call <2 x half> @llvm.nvvm.add.rn.sat.v2f16(<2 x half> %4, <2 x half> %4)
+ // CHECK-NEXT: %6 = call <2 x half> @llvm.nvvm.add.rn.ftz.sat.v2f16(<2 x half> %5, <2 x half> %5)
+ // CHECK-NEXT: ret <2 x half> %3
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b : vector<2xf16>
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xf16>
+ %f3 = nvvm.addf %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>} : vector<2xf16>
+ %f4 = nvvm.addf %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>, ftz=true} : vector<2xf16>
+ llvm.return %f1 : vector<2xf16>
+}
+
+// vector<2xbf16> + vector<2xbf16> -> vector<2xbf16>
+llvm.func @addf_vector_bf16_bf16(%a : vector<2xbf16>, %b : vector<2xbf16>) -> vector<2xbf16> {
+ // CHECK-LABEL: define <2 x bfloat> @addf_vector_bf16_bf16(<2 x bfloat> %0, <2 x bfloat> %1) {
+ // CHECK-NEXT: %3 = fadd <2 x bfloat> %0, %1
+ // CHECK-NEXT: %4 = fadd <2 x bfloat> %3, %3
+ // CHECK-NEXT: ret <2 x bfloat> %4
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b : vector<2xbf16>
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xbf16>
+ llvm.return %f2 : vector<2xbf16>
+}
+
+// vector<2xf32> + vector<2xf32> -> vector<2xf32>
+llvm.func @addf_vector_f32_f32_rn(%a : vector<2xf32>, %b : vector<2xf32>) -> vector<2xf32> {
+ // CHECK-LABEL: define <2 x float> @addf_vector_f32_f32_rn(<2 x float> %0, <2 x float> %1) {
+ // CHECK-NEXT: %3 = extractelement <2 x float> %0, i32 0
+ // CHECK-NEXT: %4 = extractelement <2 x float> %1, i32 0
+ // CHECK-NEXT: %5 = call float @llvm.nvvm.add.rn.f(float %3, float %4)
+ // CHECK-NEXT: %6 = insertelement <2 x float> poison, float %5, i32 0
+ // CHECK-NEXT: %7 = extractelement <2 x float> %0, i32 1
+ // CHECK-NEXT: %8 = extractelement <2 x float> %1, i32 1
+ // CHECK-NEXT: %9 = call float @llvm.nvvm.add.rn.f(float %7, float %8)
+ // CHECK-NEXT: %10 = insertelement <2 x float> %6, float %9, i32 1
+ // CHECK-NEXT: %11 = extractelement <2 x float> %10, i32 0
+ // CHECK-NEXT: %12 = extractelement <2 x float> %10, i32 0
+ // CHECK-NEXT: %13 = call float @llvm.nvvm.add.rn.f(float %11, float %12)
+ // CHECK-NEXT: %14 = insertelement <2 x float> poison, float %13, i32 0
+ // CHECK-NEXT: %15 = extractelement <2 x float> %10, i32 1
+ // CHECK-NEXT: %16 = extractelement <2 x float> %10, i32 1
+ // CHECK-NEXT: %17 = call float @llvm.nvvm.add.rn.f(float %15, float %16)
+ // CHECK-NEXT: %18 = insertelement <2 x float> %14, float %17, i32 1
+ // CHECK-NEXT: %19 = extractelement <2 x float> %18, i32 0
+ // CHECK-NEXT: %20 = extractelement <2 x float> %18, i32 0
+ // CHECK-NEXT: %21 = call float @llvm.nvvm.add.rn.sat.f(float %19, float %20)
+ // CHECK-NEXT: %22 = insertelement <2 x float> poison, float %21, i32 0
+ // CHECK-NEXT: %23 = extractelement <2 x float> %18, i32 1
+ // CHECK-NEXT: %24 = extractelement <2 x float> %18, i32 1
+ // CHECK-NEXT: %25 = call float @llvm.nvvm.add.rn.sat.f(float %23, float %24)
+ // CHECK-NEXT: %26 = insertelement <2 x float> %22, float %25, i32 1
+ // CHECK-NEXT: %27 = extractelement <2 x float> %26, i32 0
+ // CHECK-NEXT: %28 = extractelement <2 x float> %26, i32 0
+ // CHECK-NEXT: %29 = call float @llvm.nvvm.add.rn.ftz.f(float %27, float %28)
+ // CHECK-NEXT: %30 = insertelement <2 x float> poison, float %29, i32 0
+ // CHECK-NEXT: %31 = extractelement <2 x float> %26, i32 1
+ // CHECK-NEXT: %32 = extractelement <2 x float> %26, i32 1
+ // CHECK-NEXT: %33 = call float @llvm.nvvm.add.rn.ftz.f(float %31, float %32)
+ // CHECK-NEXT: %34 = insertelement <2 x float> %30, float %33, i32 1
+ // CHECK-NEXT: %35 = extractelement <2 x float> %34, i32 0
+ // CHECK-NEXT: %36 = extractelement <2 x float> %34, i32 0
+ // CHECK-NEXT: %37 = call float @llvm.nvvm.add.rn.ftz.sat.f(float %35, float %36)
+ // CHECK-NEXT: %38 = insertelement <2 x float> poison, float %37, i32 0
+ // CHECK-NEXT: %39 = extractelement <2 x float> %34, i32 1
+ // CHECK-NEXT: %40 = extractelement <2 x float> %34, i32 1
+ // CHECK-NEXT: %41 = call float @llvm.nvvm.add.rn.ftz.sat.f(float %39, float %40)
+ // CHECK-NEXT: %42 = insertelement <2 x float> %38, float %41, i32 1
+ // CHECK-NEXT: ret <2 x float> %34
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b : vector<2xf32>
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xf32>
+ %f3 = nvvm.addf %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>} : vector<2xf32>
+ %f4 = nvvm.addf %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rn>, ftz=true} : vector<2xf32>
+ %f5 = nvvm.addf %f4, %f4 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>, ftz=true} : vector<2xf32>
+ llvm.return %f4 : vector<2xf32>
+}
+
+llvm.func @addf_vector_f32_f32_rm(%a : vector<2xf32>, %b : vector<2xf32>) -> vector<2xf32> {
+ // CHECK-LABEL: define <2 x float> @addf_vector_f32_f32_rm(<2 x float> %0, <2 x float> %1) {
+ // CHECK-NEXT: %3 = extractelement <2 x float> %0, i32 0
+ // CHECK-NEXT: %4 = extractelement <2 x float> %1, i32 0
+ // CHECK-NEXT: %5 = call float @llvm.nvvm.add.rm.f(float %3, float %4)
+ // CHECK-NEXT: %6 = insertelement <2 x float> poison, float %5, i32 0
+ // CHECK-NEXT: %7 = extractelement <2 x float> %0, i32 1
+ // CHECK-NEXT: %8 = extractelement <2 x float> %1, i32 1
+ // CHECK-NEXT: %9 = call float @llvm.nvvm.add.rm.f(float %7, float %8)
+ // CHECK-NEXT: %10 = insertelement <2 x float> %6, float %9, i32 1
+ // CHECK-NEXT: %11 = extractelement <2 x float> %10, i32 0
+ // CHECK-NEXT: %12 = extractelement <2 x float> %10, i32 0
+ // CHECK-NEXT: %13 = call float @llvm.nvvm.add.rm.sat.f(float %11, float %12)
+ // CHECK-NEXT: %14 = insertelement <2 x float> poison, float %13, i32 0
+ // CHECK-NEXT: %15 = extractelement <2 x float> %10, i32 1
+ // CHECK-NEXT: %16 = extractelement <2 x float> %10, i32 1
+ // CHECK-NEXT: %17 = call float @llvm.nvvm.add.rm.sat.f(float %15, float %16)
+ // CHECK-NEXT: %18 = insertelement <2 x float> %14, float %17, i32 1
+ // CHECK-NEXT: %19 = extractelement <2 x float> %18, i32 0
+ // CHECK-NEXT: %20 = extractelement <2 x float> %18, i32 0
+ // CHECK-NEXT: %21 = call float @llvm.nvvm.add.rm.ftz.f(float %19, float %20)
+ // CHECK-NEXT: %22 = insertelement <2 x float> poison, float %21, i32 0
+ // CHECK-NEXT: %23 = extractelement <2 x float> %18, i32 1
+ // CHECK-NEXT: %24 = extractelement <2 x float> %18, i32 1
+ // CHECK-NEXT: %25 = call float @llvm.nvvm.add.rm.ftz.f(float %23, float %24)
+ // CHECK-NEXT: %26 = insertelement <2 x float> %22, float %25, i32 1
+ // CHECK-NEXT: %27 = extractelement <2 x float> %26, i32 0
+ // CHECK-NEXT: %28 = extractelement <2 x float> %26, i32 0
+ // CHECK-NEXT: %29 = call float @llvm.nvvm.add.rm.ftz.sat.f(float %27, float %28)
+ // CHECK-NEXT: %30 = insertelement <2 x float> poison, float %29, i32 0
+ // CHECK-NEXT: %31 = extractelement <2 x float> %26, i32 1
+ // CHECK-NEXT: %32 = extractelement <2 x float> %26, i32 1
+ // CHECK-NEXT: %33 = call float @llvm.nvvm.add.rm.ftz.sat.f(float %31, float %32)
+ // CHECK-NEXT: %34 = insertelement <2 x float> %30, float %33, i32 1
+ // CHECK-NEXT: ret <2 x float> %34
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rm>} : vector<2xf32>
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rm>, sat = #nvvm.sat_mode<sat>} : vector<2xf32>
+ %f3 = nvvm.addf %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rm>, ftz=true} : vector<2xf32>
+ %f4 = nvvm.addf %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rm>, sat = #nvvm.sat_mode<sat>, ftz=true} : vector<2xf32>
+ llvm.return %f4 : vector<2xf32>
+}
+
+llvm.func @addf_vector_f32_f32_rp(%a : vector<2xf32>, %b : vector<2xf32>) -> vector<2xf32> {
+ // CHECK-LABEL: define <2 x float> @addf_vector_f32_f32_rp(<2 x float> %0, <2 x float> %1) {
+ // CHECK-NEXT: %3 = extractelement <2 x float> %0, i32 0
+ // CHECK-NEXT: %4 = extractelement <2 x float> %1, i32 0
+ // CHECK-NEXT: %5 = call float @llvm.nvvm.add.rp.f(float %3, float %4)
+ // CHECK-NEXT: %6 = insertelement <2 x float> poison, float %5, i32 0
+ // CHECK-NEXT: %7 = extractelement <2 x float> %0, i32 1
+ // CHECK-NEXT: %8 = extractelement <2 x float> %1, i32 1
+ // CHECK-NEXT: %9 = call float @llvm.nvvm.add.rp.f(float %7, float %8)
+ // CHECK-NEXT: %10 = insertelement <2 x float> %6, float %9, i32 1
+ // CHECK-NEXT: %11 = extractelement <2 x float> %10, i32 0
+ // CHECK-NEXT: %12 = extractelement <2 x float> %10, i32 0
+ // CHECK-NEXT: %13 = call float @llvm.nvvm.add.rp.sat.f(float %11, float %12)
+ // CHECK-NEXT: %14 = insertelement <2 x float> poison, float %13, i32 0
+ // CHECK-NEXT: %15 = extractelement <2 x float> %10, i32 1
+ // CHECK-NEXT: %16 = extractelement <2 x float> %10, i32 1
+ // CHECK-NEXT: %17 = call float @llvm.nvvm.add.rp.sat.f(float %15, float %16)
+ // CHECK-NEXT: %18 = insertelement <2 x float> %14, float %17, i32 1
+ // CHECK-NEXT: %19 = extractelement <2 x float> %18, i32 0
+ // CHECK-NEXT: %20 = extractelement <2 x float> %18, i32 0
+ // CHECK-NEXT: %21 = call float @llvm.nvvm.add.rp.ftz.f(float %19, float %20)
+ // CHECK-NEXT: %22 = insertelement <2 x float> poison, float %21, i32 0
+ // CHECK-NEXT: %23 = extractelement <2 x float> %18, i32 1
+ // CHECK-NEXT: %24 = extractelement <2 x float> %18, i32 1
+ // CHECK-NEXT: %25 = call float @llvm.nvvm.add.rp.ftz.f(float %23, float %24)
+ // CHECK-NEXT: %26 = insertelement <2 x float> %22, float %25, i32 1
+ // CHECK-NEXT: %27 = extractelement <2 x float> %26, i32 0
+ // CHECK-NEXT: %28 = extractelement <2 x float> %26, i32 0
+ // CHECK-NEXT: %29 = call float @llvm.nvvm.add.rp.ftz.sat.f(float %27, float %28)
+ // CHECK-NEXT: %30 = insertelement <2 x float> poison, float %29, i32 0
+ // CHECK-NEXT: %31 = extractelement <2 x float> %26, i32 1
+ // CHECK-NEXT: %32 = extractelement <2 x float> %26, i32 1
+ // CHECK-NEXT: %33 = call float @llvm.nvvm.add.rp.ftz.sat.f(float %31, float %32)
+ // CHECK-NEXT: %34 = insertelement <2 x float> %30, float %33, i32 1
+ // CHECK-NEXT: ret <2 x float> %34
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rp>} : vector<2xf32>
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<sat>} : vector<2xf32>
+ %f3 = nvvm.addf %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rp>, ftz=true} : vector<2xf32>
+ %f4 = nvvm.addf %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<sat>, ftz=true} : vector<2xf32>
+ llvm.return %f4 : vector<2xf32>
+}
+
+llvm.func @addf_vector_f32_f32_rz(%a : vector<2xf32>, %b : vector<2xf32>) -> vector<2xf32> {
+ // CHECK-LABEL: define <2 x float> @addf_vector_f32_f32_rz(<2 x float> %0, <2 x float> %1) {
+ // CHECK-NEXT: %3 = extractelement <2 x float> %0, i32 0
+ // CHECK-NEXT: %4 = extractelement <2 x float> %1, i32 0
+ // CHECK-NEXT: %5 = call float @llvm.nvvm.add.rz.f(float %3, float %4)
+ // CHECK-NEXT: %6 = insertelement <2 x float> poison, float %5, i32 0
+ // CHECK-NEXT: %7 = extractelement <2 x float> %0, i32 1
+ // CHECK-NEXT: %8 = extractelement <2 x float> %1, i32 1
+ // CHECK-NEXT: %9 = call float @llvm.nvvm.add.rz.f(float %7, float %8)
+ // CHECK-NEXT: %10 = insertelement <2 x float> %6, float %9, i32 1
+ // CHECK-NEXT: %11 = extractelement <2 x float> %10, i32 0
+ // CHECK-NEXT: %12 = extractelement <2 x float> %10, i32 0
+ // CHECK-NEXT: %13 = call float @llvm.nvvm.add.rz.sat.f(float %11, float %12)
+ // CHECK-NEXT: %14 = insertelement <2 x float> poison, float %13, i32 0
+ // CHECK-NEXT: %15 = extractelement <2 x float> %10, i32 1
+ // CHECK-NEXT: %16 = extractelement <2 x float> %10, i32 1
+ // CHECK-NEXT: %17 = call float @llvm.nvvm.add.rz.sat.f(float %15, float %16)
+ // CHECK-NEXT: %18 = insertelement <2 x float> %14, float %17, i32 1
+ // CHECK-NEXT: %19 = extractelement <2 x float> %18, i32 0
+ // CHECK-NEXT: %20 = extractelement <2 x float> %18, i32 0
+ // CHECK-NEXT: %21 = call float @llvm.nvvm.add.rz.ftz.f(float %19, float %20)
+ // CHECK-NEXT: %22 = insertelement <2 x float> poison, float %21, i32 0
+ // CHECK-NEXT: %23 = extractelement <2 x float> %18, i32 1
+ // CHECK-NEXT: %24 = extractelement <2 x float> %18, i32 1
+ // CHECK-NEXT: %25 = call float @llvm.nvvm.add.rz.ftz.f(float %23, float %24)
+ // CHECK-NEXT: %26 = insertelement <2 x float> %22, float %25, i32 1
+ // CHECK-NEXT: %27 = extractelement <2 x float> %26, i32 0
+ // CHECK-NEXT: %28 = extractelement <2 x float> %26, i32 0
+ // CHECK-NEXT: %29 = call float @llvm.nvvm.add.rz.ftz.sat.f(float %27, float %28)
+ // CHECK-NEXT: %30 = insertelement <2 x float> poison, float %29, i32 0
+ // CHECK-NEXT: %31 = extractelement <2 x float> %26, i32 1
+ // CHECK-NEXT: %32 = extractelement <2 x float> %26, i32 1
+ // CHECK-NEXT: %33 = call float @llvm.nvvm.add.rz.ftz.sat.f(float %31, float %32)
+ // CHECK-NEXT: %34 = insertelement <2 x float> %30, float %33, i32 1
+ // CHECK-NEXT: ret <2 x float> %34
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xf32>
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<sat>} : vector<2xf32>
+ %f3 = nvvm.addf %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rz>, ftz=true} : vector<2xf32>
+ %f4 = nvvm.addf %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<sat>, ftz=true} : vector<2xf32>
+ llvm.return %f4 : vector<2xf32>
+}
+
+// vector<2xf64> + vector<2xf64> -> vector<2xf64>
+llvm.func @addf_vector_f64_f64_rn(%a : vector<2xf64>, %b : vector<2xf64>) -> vector<2xf64> {
+ // CHECK-LABEL: define <2 x double> @addf_vector_f64_f64_rn(<2 x double> %0, <2 x double> %1) {
+ // CHECK-NEXT: %3 = extractelement <2 x double> %0, i32 0
+ // CHECK-NEXT: %4 = extractelement <2 x double> %1, i32 0
+ // CHECK-NEXT: %5 = call double @llvm.nvvm.add.rn.d(double %3, double %4)
+ // CHECK-NEXT: %6 = insertelement <2 x double> poison, double %5, i32 0
+ // CHECK-NEXT: %7 = extractelement <2 x double> %0, i32 1
+ // CHECK-NEXT: %8 = extractelement <2 x double> %1, i32 1
+ // CHECK-NEXT: %9 = call double @llvm.nvvm.add.rn.d(double %7, double %8)
+ // CHECK-NEXT: %10 = insertelement <2 x double> %6, double %9, i32 1
+ // CHECK-NEXT: %11 = extractelement <2 x double> %10, i32 0
+ // CHECK-NEXT: %12 = extractelement <2 x double> %10, i32 0
+ // CHECK-NEXT: %13 = call double @llvm.nvvm.add.rn.d(double %11, double %12)
+ // CHECK-NEXT: %14 = insertelement <2 x double> poison, double %13, i32 0
+ // CHECK-NEXT: %15 = extractelement <2 x double> %10, i32 1
+ // CHECK-NEXT: %16 = extractelement <2 x double> %10, i32 1
+ // CHECK-NEXT: %17 = call double @llvm.nvvm.add.rn.d(double %15, double %16)
+ // CHECK-NEXT: %18 = insertelement <2 x double> %14, double %17, i32 1
+ // CHECK-NEXT: ret <2 x double> %18
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b : vector<2xf64>
+ %f2 = nvvm.addf %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xf64>
+ llvm.return %f2 : vector<2xf64>
+}
+
+llvm.func @addf_vector_f64_f64_rm(%a : vector<2xf64>, %b : vector<2xf64>) -> vector<2xf64> {
+ // CHECK-LABEL: define <2 x double> @addf_vector_f64_f64_rm(<2 x double> %0, <2 x double> %1) {
+ // CHECK-NEXT: %3 = extractelement <2 x double> %0, i32 0
+ // CHECK-NEXT: %4 = extractelement <2 x double> %1, i32 0
+ // CHECK-NEXT: %5 = call double @llvm.nvvm.add.rm.d(double %3, double %4)
+ // CHECK-NEXT: %6 = insertelement <2 x double> poison, double %5, i32 0
+ // CHECK-NEXT: %7 = extractelement <2 x double> %0, i32 1
+ // CHECK-NEXT: %8 = extractelement <2 x double> %1, i32 1
+ // CHECK-NEXT: %9 = call double @llvm.nvvm.add.rm.d(double %7, double %8)
+ // CHECK-NEXT: %10 = insertelement <2 x double> %6, double %9, i32 1
+ // CHECK-NEXT: ret <2 x double> %10
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rm>} : vector<2xf64>
+ llvm.return %f1 : vector<2xf64>
+}
+
+llvm.func @addf_vector_f64_f64_rp(%a : vector<2xf64>, %b : vector<2xf64>) -> vector<2xf64> {
+ // CHECK-LABEL: define <2 x double> @addf_vector_f64_f64_rp(<2 x double> %0, <2 x double> %1) {
+ // CHECK-NEXT: %3 = extractelement <2 x double> %0, i32 0
+ // CHECK-NEXT: %4 = extractelement <2 x double> %1, i32 0
+ // CHECK-NEXT: %5 = call double @llvm.nvvm.add.rp.d(double %3, double %4)
+ // CHECK-NEXT: %6 = insertelement <2 x double> poison, double %5, i32 0
+ // CHECK-NEXT: %7 = extractelement <2 x double> %0, i32 1
+ // CHECK-NEXT: %8 = extractelement <2 x double> %1, i32 1
+ // CHECK-NEXT: %9 = call double @llvm.nvvm.add.rp.d(double %7, double %8)
+ // CHECK-NEXT: %10 = insertelement <2 x double> %6, double %9, i32 1
+ // CHECK-NEXT: ret <2 x double> %10
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rp>} : vector<2xf64>
+ llvm.return %f1 : vector<2xf64>
+}
+
+llvm.func @addf_vector_f64_f64_rz(%a : vector<2xf64>, %b : vector<2xf64>) -> vector<2xf64> {
+ // CHECK-LABEL: define <2 x double> @addf_vector_f64_f64_rz(<2 x double> %0, <2 x double> %1) {
+ // CHECK-NEXT: %3 = extractelement <2 x double> %0, i32 0
+ // CHECK-NEXT: %4 = extractelement <2 x double> %1, i32 0
+ // CHECK-NEXT: %5 = call double @llvm.nvvm.add.rz.d(double %3, double %4)
+ // CHECK-NEXT: %6 = insertelement <2 x double> poison, double %5, i32 0
+ // CHECK-NEXT: %7 = extractelement <2 x double> %0, i32 1
+ // CHECK-NEXT: %8 = extractelement <2 x double> %1, i32 1
+ // CHECK-NEXT: %9 = call double @llvm.nvvm.add.rz.d(double %7, double %8)
+ // CHECK-NEXT: %10 = insertelement <2 x double> %6, double %9, i32 1
+ // CHECK-NEXT: ret <2 x double> %10
+ // CHECK-NEXT: }
+ %f1 = nvvm.addf %a, %b {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xf64>
+ llvm.return %f1 : vector<2xf64>
+}
More information about the Mlir-commits
mailing list