[Mlir-commits] [mlir] [MLIR][NVVM] Add nvvm.fadd and nvvm.fsub Ops (PR #179162)

Sun Feb 1 20:54:49 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-mlir-llvm

Author: Srinivasa Ravi (Wolfram70)

<details>
<summary>Changes</summary>

This change adds the `nvvm.fadd` and `nvvm.fsub` Ops to the NVVM dialect. `nvvm.fadd` performs floating point addition of two operands along with any conversions necessary.
`nvvm.fsub` performs floating point subtraction of two operands and is equivalent to an `llvm.fneg` followed by an `nvvm.fadd` operation.

PTX ISA Reference:
1. https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-add
2. https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add
3. https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-add

---

Patch is 71.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/179162.diff


7 Files Affected:

- (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+76-5) 
- (modified) mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp (+212) 
- (added) mlir/test/Dialect/LLVMIR/nvvm-canonicalize.mlir (+9) 
- (added) mlir/test/Target/LLVMIR/nvvm/fadd_all_same_types.mlir (+89) 
- (added) mlir/test/Target/LLVMIR/nvvm/fadd_different_return_type.mlir (+400) 
- (added) mlir/test/Target/LLVMIR/nvvm/fadd_invalid.mlir (+107) 
- (added) mlir/test/Target/LLVMIR/nvvm/fadd_mixed_arg_types.mlir (+684) 


``````````diff

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 64a52acbb2278..0dce63c4e5a74 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1860,12 +1860,12 @@ def FPRoundingModeAttr : EnumAttr<NVVM_Dialect, FPRoundingMode, "fp_rnd_mode"> {
   let assemblyFormat = "`<` $value `>`";
 }
 
-def SaturationModeNone   : I32EnumAttrCase<"NONE", 0, "none">;
-def SaturationModeFinite : I32EnumAttrCase<"SATFINITE", 1, "satfinite">;
+def SaturationModeNone   : I32EnumCase<"NONE", 0, "none">;
+def SaturationModeFinite : I32EnumCase<"SATFINITE", 1, "satfinite">;
+def SaturationModeSat    : I32EnumCase<"SAT", 2, "sat">;
 
-def SaturationMode : I32EnumAttr<"SaturationMode", "NVVM SaturationMode kind",
-  [SaturationModeNone, SaturationModeFinite]> {
-  let genSpecializedAttr = 0;
+def SaturationMode : I32Enum<"SaturationMode", "NVVM SaturationMode kind",
+  [SaturationModeNone, SaturationModeFinite, SaturationModeSat]> {
   let cppNamespace = "::mlir::NVVM";
 }
 def SaturationModeAttr : EnumAttr<NVVM_Dialect, SaturationMode, "sat_mode"> {
@@ -6155,6 +6155,77 @@ def NVVM_Tcgen05MMAWsSparseOp : NVVM_Op<"tcgen05.mma.ws.sp",
   }];
 }
 
+def NVVM_FloatAdditionOp : 
+  NVVM_SingleResultIntrinsicOp<"fadd", [Pure, Commutative]> {
+  let summary = [{
+    Performs floating point addition operation with support for mixed precision 
+    operands
+  }];
+  let description = [{
+    The `nvvm.fadd` operation performs floating point addition of two operands.
+
+    The rounding mode to be used is specified by the `rnd` attribute, 
+    saturation mode by the `sat` attribute, and FTZ by the `ftz` unit attribute.
+    
+    The result type must be at least as wide as the operands. The operands are 
+    converted to the result type before addition if it is wider.
+    
+    For more information, see PTX ISA - [floating point addition](https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-add), 
+    [half-precision floating point addition](https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add), 
+    [mixed precision floating point addition](https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-add).
+  }];
+  let arguments = (ins
+    AnyTypeOf<[F16, BF16, F32, F64, VectorOfLengthAndType<[2], [F16, BF16]>]>:$lhs,
+    AnyTypeOf<[F16, BF16, F32, F64, VectorOfLengthAndType<[2], [F16, BF16]>]>:$rhs,
+    DefaultValuedAttr<FPRoundingModeAttr, "FPRoundingMode::NONE">:$rnd,
+    DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat,
+    UnitAttr:$ftz
+  );
+  let results = (outs AnyTypeOf<[F16, BF16, F32, F64, VectorOfLengthAndType<[2], [F16, BF16]>]>:$res);
+  let assemblyFormat = "$lhs `,` $rhs attr-dict `:` type(operands) `->` type($res)";
+  let hasVerifier = 1;
+  
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::FloatAdditionOp::getIntrinsicIDAndArgs(*op, moduleTranslation, builder);
+    if(ID != llvm::Intrinsic::not_intrinsic) {
+      llvm::Value *addResult = createIntrinsicCall(builder, ID, args);
+      $res = ($_resultType->getScalarSizeInBits() > 
+              addResult->getType()->getScalarSizeInBits()) 
+             ? builder.CreateFPExt(addResult, $_resultType) : addResult;
+    }
+  }];
+}
+
+def NVVM_FloatSubtractionOp : 
+  NVVM_Op<"fsub", [Pure]> {
+  let summary = [{
+    Performs floating point subtraction operation with support for mixed 
+    precision operands
+  }];
+  let description = [{
+    The `nvvm.fsub` operation performs floating point subtraction of two 
+    operands.
+
+    It supports the same type combinations and modifiers as `nvvm.fadd`.
+    This is equivalent to `nvvm.fadd(lhs, -rhs)`.
+    
+    For more information, see PTX ISA - [floating point subtraction](https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sub), 
+    [half-precision floating point subtraction](https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-sub), 
+    [mixed precision floating point subtraction](https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-sub).
+ 
+  }];
+  let arguments = (ins
+    AnyTypeOf<[F16, BF16, F32, F64, VectorOfLengthAndType<[2], [F16, BF16]>]>:$lhs,
+    AnyTypeOf<[F16, BF16, F32, F64, VectorOfLengthAndType<[2], [F16, BF16]>]>:$rhs,
+    DefaultValuedAttr<FPRoundingModeAttr, "FPRoundingMode::NONE">:$rnd,
+    DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat,
+    UnitAttr:$ftz
+  );
+  let results = (outs AnyTypeOf<[F16, BF16, F32, F64, VectorOfLengthAndType<[2], [F16, BF16]>]>:$res);
+  let assemblyFormat = "$lhs `,` $rhs attr-dict `:` type(operands) `->` type($res)";
+  let hasCanonicalizer = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // NVVM tensormap.replace Op
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 76ec8b8b7cfd2..033b420d0faee 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -3072,6 +3072,85 @@ LogicalResult NVVM::TensormapReplaceOp::verify() {
   return success();
 }
 
+LogicalResult NVVM::FloatAdditionOp::verify() {
+  auto resFType = getRes().getType();
+  auto lhsFType = getLhs().getType();
+  auto rhsFType = getRhs().getType();
+  auto rndMode = getRnd();
+  auto satMode = getSat();
+  auto isFTZ = getFtz();
+
+  if (satMode == NVVM::SaturationMode::SATFINITE)
+    return emitOpError("SATFINITE saturation mode is not supported for "
+                       "floating point addition operation");
+
+  if (isa<VectorType>(resFType) != isa<VectorType>(lhsFType) ||
+      isa<VectorType>(resFType) != isa<VectorType>(rhsFType))
+    return emitOpError("cannot mix vector and scalar types for floating point "
+                       "addition operation");
+
+  if (isa<VectorType>(lhsFType) &&
+      ((cast<VectorType>(lhsFType).getElementType() !=
+        cast<VectorType>(rhsFType).getElementType()) ||
+       (cast<VectorType>(lhsFType).getElementType() !=
+        cast<VectorType>(resFType).getElementType())))
+    return emitOpError(
+        "cannot mix different element types for vector floating point "
+        "addition operation");
+
+  if (resFType.isF64() && (satMode != NVVM::SaturationMode::NONE || isFTZ))
+    return emitOpError("FTZ and saturation are not supported for additions "
+                       "involving f64 type");
+
+  auto getBaseFType = [](Type type) -> Type {
+    if (isa<VectorType>(type))
+      return cast<VectorType>(type).getElementType();
+    return type;
+  };
+
+  auto resBaseFType = getBaseFType(resFType);
+  auto lhsBaseFType = getBaseFType(lhsFType);
+  auto rhsBaseFType = getBaseFType(rhsFType);
+
+  if (resBaseFType.getIntOrFloatBitWidth() <
+      std::max(lhsBaseFType.getIntOrFloatBitWidth(),
+               rhsBaseFType.getIntOrFloatBitWidth()))
+    return emitOpError("result type must be at least as wide as the operands");
+
+  if (resBaseFType.isF16() && rndMode != NVVM::FPRoundingMode::RN &&
+      rndMode != NVVM::FPRoundingMode::NONE)
+    return emitOpError("only RN rounding mode is supported for f16 and "
+                       "vector<2xf16> additions");
+
+  if (resBaseFType.isBF16()) {
+    if (rndMode != NVVM::FPRoundingMode::RN &&
+        rndMode != NVVM::FPRoundingMode::NONE)
+      return emitOpError("only RN rounding mode is supported for bf16 and "
+                         "vector<2xbf16> additions");
+    if (satMode != NVVM::SaturationMode::NONE || isFTZ)
+      return emitOpError("FTZ and saturation are not supported for bf16 and "
+                         "vector<2xbf16> additions");
+  }
+
+  if (resBaseFType.isF16() && !(lhsBaseFType.isF16() && rhsBaseFType.isF16()))
+    return emitOpError("only f16 + f16 is supported for f16 result type");
+
+  if (resBaseFType.isBF16() &&
+      !(lhsBaseFType.isBF16() && rhsBaseFType.isBF16()))
+    return emitOpError("only bf16 + bf16 is supported for bf16 result type");
+
+  // FIXME: This is a temporary check disallowing lowering to add.rn.ftz.f16(x2)
+  // PTX instructions since the corresponding LLVM intrinsic is missing. This
+  // should be removed once the intrinsics for f16 addition (with FTZ only) are
+  // available.
+  if ((isa<VectorType>(resFType) || resBaseFType.isF16()) && isFTZ &&
+      satMode == NVVM::SaturationMode::NONE)
+    return emitOpError(
+        "FTZ with no saturation is not supported for f16 additions");
+
+  return success();
+}
+
 /// Packs the given `field` into the `result`.
 /// The `result` is 64-bits and each `field` can be 32-bits or narrower.
 static llvm::Value *
@@ -3148,6 +3227,33 @@ std::string NVVM::MBarrierTryWaitParityOp::getPtx() {
                        space);
 }
 
+//===----------------------------------------------------------------------===//
+// Canonicalization patterns
+//===----------------------------------------------------------------------===//
+
+struct ConvertFsubToFnegFadd : public OpRewritePattern<FloatSubtractionOp> {
+  using OpRewritePattern<FloatSubtractionOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(FloatSubtractionOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    Value negRhs =
+        LLVM::FNegOp::create(rewriter, loc, op.getRhs().getType(), op.getRhs());
+
+    rewriter.replaceOpWithNewOp<FloatAdditionOp>(op, op.getType(), op.getLhs(),
+                                                 negRhs, op.getRnd(),
+                                                 op.getSat(), op.getFtz());
+
+    return success();
+  }
+};
+
+void FloatSubtractionOp::getCanonicalizationPatterns(
+    RewritePatternSet &patterns, MLIRContext *context) {
+  patterns.add<ConvertFsubToFnegFadd>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // getIntrinsicID/getIntrinsicIDAndArgs methods
 //===----------------------------------------------------------------------===//
@@ -4887,6 +4993,112 @@ mlir::NVVM::IDArgPair TensormapReplaceOp::getIntrinsicIDAndArgs(
   return {IDs[fieldIndex], args};
 }
 
+mlir::NVVM::IDArgPair FloatAdditionOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::FloatAdditionOp>(op);
+  llvm::SmallVector<llvm::Value *> args;
+  auto rndMode = thisOp.getRnd();
+  bool isRndRN = rndMode == NVVM::FPRoundingMode::RN;
+  auto isSat = thisOp.getSat() == NVVM::SaturationMode::SAT;
+  auto isFTZ = thisOp.getFtz();
+
+  llvm::Value *argLHS = mt.lookupValue(thisOp.getLhs());
+  llvm::Value *argRHS = mt.lookupValue(thisOp.getRhs());
+
+  mlir::Type lhsType = thisOp.getLhs().getType();
+  mlir::Type rhsType = thisOp.getRhs().getType();
+  mlir::Type resType = thisOp.getRes().getType();
+
+  // FIXME: Add intrinsics for add.rn.ftz.f16x2 and add.rn.ftz.f16 here when
+  // they are available.
+  static constexpr llvm::Intrinsic::ID f16IDs[] = {
+      llvm::Intrinsic::nvvm_add_rn_sat_f16,
+      llvm::Intrinsic::nvvm_add_rn_ftz_sat_f16,
+      llvm::Intrinsic::nvvm_add_rn_sat_v2f16,
+      llvm::Intrinsic::nvvm_add_rn_ftz_sat_v2f16,
+  };
+
+  static constexpr llvm::Intrinsic::ID f32IDs[] = {
+      llvm::Intrinsic::nvvm_add_rn_f, // default rounding mode RN
+      llvm::Intrinsic::nvvm_add_rn_f,
+      llvm::Intrinsic::nvvm_add_rm_f,
+      llvm::Intrinsic::nvvm_add_rp_f,
+      llvm::Intrinsic::nvvm_add_rz_f,
+      llvm::Intrinsic::nvvm_add_rn_sat_f, // default rounding mode RN
+      llvm::Intrinsic::nvvm_add_rn_sat_f,
+      llvm::Intrinsic::nvvm_add_rm_sat_f,
+      llvm::Intrinsic::nvvm_add_rp_sat_f,
+      llvm::Intrinsic::nvvm_add_rz_sat_f,
+      llvm::Intrinsic::nvvm_add_rn_ftz_f, // default rounding mode RN
+      llvm::Intrinsic::nvvm_add_rn_ftz_f,
+      llvm::Intrinsic::nvvm_add_rm_ftz_f,
+      llvm::Intrinsic::nvvm_add_rp_ftz_f,
+      llvm::Intrinsic::nvvm_add_rz_ftz_f,
+      llvm::Intrinsic::nvvm_add_rn_ftz_sat_f, // default rounding mode RN
+      llvm::Intrinsic::nvvm_add_rn_ftz_sat_f,
+      llvm::Intrinsic::nvvm_add_rm_ftz_sat_f,
+      llvm::Intrinsic::nvvm_add_rp_ftz_sat_f,
+      llvm::Intrinsic::nvvm_add_rz_ftz_sat_f,
+  };
+
+  static constexpr llvm::Intrinsic::ID f64IDs[] = {
+      llvm::Intrinsic::nvvm_add_rn_d, // default rounding mode RN
+      llvm::Intrinsic::nvvm_add_rn_d, llvm::Intrinsic::nvvm_add_rm_d,
+      llvm::Intrinsic::nvvm_add_rp_d, llvm::Intrinsic::nvvm_add_rz_d};
+
+  auto addIntrinsic = [&](llvm::Intrinsic::ID IID, llvm::Value *LHS = nullptr,
+                          llvm::Value *RHS = nullptr) -> NVVM::IDArgPair {
+    args.push_back(LHS ? LHS : argLHS);
+    args.push_back(RHS ? RHS : argRHS);
+    return {IID, args};
+  };
+
+  // f16 + f16 -> f16 / vector<2xf16> + vector<2xf16> -> vector<2xf16>
+  // FIXME: Allow lowering to add.rn.ftz.f16x2 and add.rn.ftz.f16 here when the
+  // intrinsics are available.
+  bool isVectorF16Add = isa<VectorType>(resType) &&
+                        cast<VectorType>(resType).getElementType().isF16();
+  if (resType.isF16() || isVectorF16Add) {
+    if (isSat) {
+      unsigned index = (isVectorF16Add << 1) | isFTZ;
+      return addIntrinsic(f16IDs[index]);
+    } else {
+      mt.mapValue(thisOp.getRes(), builder.CreateFAdd(argLHS, argRHS));
+      return {llvm::Intrinsic::not_intrinsic, args};
+    }
+  }
+
+  // bf16 + bf16 -> bf16 / vector<2xbf16> + vector<2xbf16> -> vector<2xbf16>
+  bool isVectorBF16Add = isa<VectorType>(resType) &&
+                         cast<VectorType>(resType).getElementType().isBF16();
+  if (resType.isBF16() || isVectorBF16Add) {
+    mt.mapValue(thisOp.getRes(), builder.CreateFAdd(argLHS, argRHS));
+    return {llvm::Intrinsic::not_intrinsic, args};
+  }
+
+  // f64 + f64/f32/f16/bf16
+  if (resType.isF64()) {
+    llvm::Value *lhsF64 =
+        lhsType.isF64() ? argLHS
+                        : builder.CreateFPExt(argLHS, builder.getDoubleTy());
+    llvm::Value *rhsF64 =
+        rhsType.isF64() ? argRHS
+                        : builder.CreateFPExt(argRHS, builder.getDoubleTy());
+    unsigned index = static_cast<unsigned>(rndMode);
+    return addIntrinsic(f64IDs[index], lhsF64, rhsF64);
+  }
+
+  // f16 + f16 -> !f16 / bf16 + bf16 -> !bf16 / f16 + bf16 / f32 + f32/f16/bf16
+  llvm::Value *lhsF32 = lhsType.isF32()
+                            ? argLHS
+                            : builder.CreateFPExt(argLHS, builder.getFloatTy());
+  llvm::Value *rhsF32 = rhsType.isF32()
+                            ? argRHS
+                            : builder.CreateFPExt(argRHS, builder.getFloatTy());
+  unsigned index = ((isFTZ << 1) | isSat) * 5 + static_cast<unsigned>(rndMode);
+  return addIntrinsic(f32IDs[index], lhsF32, rhsF32);
+}
+
 //===----------------------------------------------------------------------===//
 // NVVM tcgen05.mma functions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/LLVMIR/nvvm-canonicalize.mlir b/mlir/test/Dialect/LLVMIR/nvvm-canonicalize.mlir
new file mode 100644
index 0000000000000..76d0a1453edf9
--- /dev/null
+++ b/mlir/test/Dialect/LLVMIR/nvvm-canonicalize.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s -split-input-file --canonicalize | FileCheck %s
+
+// CHECK-LABEL: @fsub_canonicalize
+llvm.func @fsub_canonicalize(%arg0 : f32, %arg1 : f32) -> f32 {
+  // CHECK: %[[NEG_ARG1:.*]] = llvm.fneg %arg1 : f32
+  // CHECK: %[[ADD_RESULT:.*]] = nvvm.fadd %arg0, %[[NEG_ARG1]] : f32, f32 -> f32
+  %0 = nvvm.fsub %arg0, %arg1 : f32, f32 -> f32
+  llvm.return %0 : f32
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/fadd_all_same_types.mlir b/mlir/test/Target/LLVMIR/nvvm/fadd_all_same_types.mlir
new file mode 100644
index 0000000000000..2aa2bf3a4906b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/fadd_all_same_types.mlir
@@ -0,0 +1,89 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// f16 + f16 -> f16
+llvm.func @fadd_f16_f16(%a : f16, %b : f16) -> f16 {
+  // CHECK-LABEL: define half @fadd_f16_f16(half %0, half %1) {
+  // CHECK-NEXT: %3 = fadd half %0, %1
+  // CHECK-NEXT: %4 = fadd half %3, %3
+  // CHECK-NEXT: %5 = call half @llvm.nvvm.add.rn.sat.f16(half %4, half %4)
+  // CHECK-NEXT: %6 = call half @llvm.nvvm.add.rn.ftz.sat.f16(half %5, half %5)
+  // CHECK-NEXT: ret half %6
+  // CHECK-NEXT: }
+  %f1 = nvvm.fadd %a, %b : f16, f16 -> f16
+  %f2 = nvvm.fadd %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : f16, f16 -> f16
+  %f3 = nvvm.fadd %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>} : f16, f16 -> f16
+  %f4 = nvvm.fadd %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>, ftz} : f16, f16 -> f16
+  llvm.return %f4 : f16
+}
+
+// bf16 + bf16 -> bf16
+llvm.func @fadd_bf16_bf16(%a : bf16, %b : bf16) -> bf16 {
+  // CHECK-LABEL: define bfloat @fadd_bf16_bf16(bfloat %0, bfloat %1) {
+  // CHECK-NEXT: %3 = fadd bfloat %0, %1
+  // CHECK-NEXT: %4 = fadd bfloat %3, %3
+  // CHECK-NEXT: ret bfloat %4
+  // CHECK-NEXT: }
+  %f1 = nvvm.fadd %a, %b : bf16, bf16 -> bf16
+  %f2 = nvvm.fadd %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : bf16, bf16 -> bf16
+  llvm.return %f2 : bf16
+}
+
+// f32 + f32 -> f32
+llvm.func @fadd_f32_f32(%a : f32, %b : f32) -> f32 {
+  // CHECK-LABEL: define float @fadd_f32_f32(float %0, float %1) {
+  // CHECK-NEXT: %3 = call float @llvm.nvvm.add.rn.f(float %0, float %1)
+  // CHECK-NEXT: %4 = call float @llvm.nvvm.add.rn.f(float %3, float %3)
+  // CHECK-NEXT: %5 = call float @llvm.nvvm.add.rn.sat.f(float %4, float %4)
+  // CHECK-NEXT: %6 = call float @llvm.nvvm.add.rn.ftz.f(float %5, float %5)
+  // CHECK-NEXT: %7 = call float @llvm.nvvm.add.rn.ftz.sat.f(float %6, float %6)
+  // CHECK-NEXT: %8 = call float @llvm.nvvm.add.rm.f(float %7, float %7)
+  // CHECK-NEXT: %9 = call float @llvm.nvvm.add.rm.sat.f(float %8, float %8)
+  // CHECK-NEXT: %10 = call float @llvm.nvvm.add.rm.ftz.f(float %9, float %9)
+  // CHECK-NEXT: %11 = call float @llvm.nvvm.add.rm.ftz.sat.f(float %10, float %10)
+  // CHECK-NEXT: %12 = call float @llvm.nvvm.add.rp.f(float %11, float %11)
+  // CHECK-NEXT: %13 = call float @llvm.nvvm.add.rp.sat.f(float %12, float %12)
+  // CHECK-NEXT: %14 = call float @llvm.nvvm.add.rp.ftz.f(float %13, float %13)
+  // CHECK-NEXT: %15 = call float @llvm.nvvm.add.rp.ftz.sat.f(float %14, float %14)
+  // CHECK-NEXT: %16 = call float @llvm.nvvm.add.rz.f(float %15, float %15)
+  // CHECK-NEXT: %17 = call float @llvm.nvvm.add.rz.sat.f(float %16, float %16)
+  // CHECK-NEXT: %18 = call float @llvm.nvvm.add.rz.ftz.f(float %17, float %17)
+  // CHECK-NEXT: %19 = call float @llvm.nvvm.add.rz.ftz.sat.f(float %18, float %18)
+  // CHECK-NEXT: ret float %19
+  // CHECK-NEXT: }
+  %f1 = nvvm.fadd %a, %b : f32, f32 -> f32
+  %f2 = nvvm.fadd %f1, %f1 {rnd = #nvvm.fp_rnd_mode<rn>} : f32, f32 -> f32
+  %f3 = nvvm.fadd %f2, %f2 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>} : f32, f32 -> f32
+  %f4 = nvvm.fadd %f3, %f3 {rnd = #nvvm.fp_rnd_mode<rn>, ftz} : f32, f32 -> f32
+  %f5 = nvvm.fadd %f4, %f4 {rnd = #nvvm.fp_rnd_mode<rn>, sat = #nvvm.sat_mode<sat>, ftz} : f32, f32 -> f32
+  %f6 = nvvm.fadd %f5, %f5 {rnd = #nvvm.fp_rnd_mode<rm>} : f32, f32 -> f32
+  %f7 = nvvm.fadd %f6, %f6 {rnd = #nvvm.fp_rnd_mode<rm>, sat = #nvvm.sat_mode<sat>} : f32, f32 -> f32
+  %f8 = nvvm.fadd %f7, %f7 {rnd = #nvvm.fp_rnd_mode<rm>, ftz} : f32, f32 -> f32
+  %f9 = nvvm.fadd %f8, %f8 {rnd = #nvvm.fp_rnd_mode<rm>, sat = #nvvm.sat_mode<sat>, ftz} : f32, f32 -> f32
+  %f10 = nvvm.fadd %f9, %f9 {rnd = #nvvm.fp_rnd_mode<rp>} : f32, f32 -> f32
+  %f11 = nvvm.fadd %f10, %f10 {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<sat>} : f32, f32 -> f32
+  %f12 = nvvm.fadd %f11, %f11 {rnd = #nvvm.fp_rnd_mode<rp>, ftz} : f32, f32 -> f32
+  %f13 = nvvm.fadd %f12, %f12 {rnd = #nvvm.fp_rnd_mode<rp>, sat = #nvvm.sat_mode<sat>, ftz} : f32, f32 -> f32
+  %f14 = nvvm.fadd %f13, %f13 {rnd = #nvvm.fp_rnd_mode<rz>} : f32, f32 -> f32
+  %f15 = nvvm.fadd %f14, %f14 {rnd = #nvvm.fp_rnd_mode<rz>, sat = #nvvm.sat_mode<sat>} : f32, f32 -> f32
+  %f16 = nvvm.fadd %f15, %f15 {rnd = #nvvm.fp_rnd_mode<rz>, ftz} : f32, f32 -> f32
+  %f17 = nvvm.fadd %f16, %f16 {rnd...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/179162