[clang] [llvm] [clang][NVPTX] Add intrinsics and builtins for mixed-precision FP arithmetic (PR #168359)
Srinivasa Ravi via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 04:29:00 PST 2025
https://github.com/Wolfram70 created https://github.com/llvm/llvm-project/pull/168359
This change adds NVVM intrinsics and clang builtins for the mixed-precision
FP arithmetic instructions (`add`, `sub`, and `fma` variants that take
`f16`/`bf16` operands alongside an `f32` operand and produce an `f32` result).
Tests are added in `mixed-precision-fp.ll` and `builtins-nvptx.c` and
verified with `ptxas` 13.0.
PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions
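
For reference, a minimal usage sketch of the new builtins (the function and
variable names are illustrative; it assumes a CUDA compile targeting `sm_100`
with PTX ISA 8.6, matching the `SM_100`/`PTX86` guards in the patch below):

```cuda
// Illustrative device code only; uses the builtins added in this patch.
// Requires sm_100 and PTX ISA 8.6.
__device__ float mixed_fma_then_add(__fp16 a, __fp16 b, float acc) {
  // Lowers to fma.rn.f32.f16: a * b + acc with f16 multiplicands and an
  // f32 addend/result, rounded to nearest even.
  acc = __nvvm_fma_mixed_rn_f16_f32(a, b, acc);
  // Lowers to add.rn.sat.f32.f16: f16 + f32 -> f32, rounded to nearest
  // even, with the result saturated to [0.0, 1.0].
  return __nvvm_add_mixed_rn_sat_f16_f32(a, acc);
}
```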
From 02db2fe3ee31ea2a2183d8cbc4b7572bed839c65 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Wed, 12 Nov 2025 09:02:24 +0000
Subject: [PATCH 1/2] [clang][NVPTX] Add intrinsics and builtins
 for mixed-precision FP arithmetic
This change adds NVVM intrinsics and clang builtins for the mixed-precision
FP arithmetic instructions (`add`, `sub`, and `fma` variants that take
`f16`/`bf16` operands alongside an `f32` operand and produce an `f32` result).
Tests are added in `mixed-precision-fp.ll` and `builtins-nvptx.c` and
verified with `ptxas` 13.0.
PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions
---
clang/include/clang/Basic/BuiltinsNVPTX.td | 64 +++++
clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp | 123 ++++++++++
clang/test/CodeGen/builtins-nvptx.c | 133 +++++++++++
llvm/include/llvm/IR/IntrinsicsNVVM.td | 25 ++
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 44 ++++
llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll | 225 ++++++++++++++++++
6 files changed, 614 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td
index d923d2a90e908..47ba12bef058c 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -401,6 +401,24 @@ def __nvvm_fma_rz_d : NVPTXBuiltin<"double(double, double, double)">;
def __nvvm_fma_rm_d : NVPTXBuiltin<"double(double, double, double)">;
def __nvvm_fma_rp_d : NVPTXBuiltin<"double(double, double, double)">;
+def __nvvm_fma_mixed_rn_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rz_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rm_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rp_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rn_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rz_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rm_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rp_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>;
+
+def __nvvm_fma_mixed_rn_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rz_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rm_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rp_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rn_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rz_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rm_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>;
+def __nvvm_fma_mixed_rp_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>;
+
// Rcp
def __nvvm_rcp_rn_ftz_f : NVPTXBuiltin<"float(float)">;
@@ -460,6 +478,52 @@ def __nvvm_add_rz_d : NVPTXBuiltin<"double(double, double)">;
def __nvvm_add_rm_d : NVPTXBuiltin<"double(double, double)">;
def __nvvm_add_rp_d : NVPTXBuiltin<"double(double, double)">;
+def __nvvm_add_mixed_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rn_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rz_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rm_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rp_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rn_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rz_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rm_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rp_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+
+def __nvvm_add_mixed_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rn_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rz_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rm_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rp_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rn_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rz_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rm_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_add_mixed_rp_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+
+// Sub
+
+def __nvvm_sub_mixed_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rn_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rz_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rm_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rp_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rn_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rz_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rm_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rp_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>;
+
+def __nvvm_sub_mixed_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rn_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rz_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rm_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rp_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rn_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rz_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rm_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+def __nvvm_sub_mixed_rp_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>;
+
// Convert
def __nvvm_d2f_rn_ftz : NVPTXBuiltin<"float(double)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
index 8a1cab3417d98..6f57620f0fb00 100644
--- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
@@ -415,6 +415,17 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF);
}
+static Value *MakeMixedPrecisionFPArithmetic(unsigned IntrinsicID,
+ const CallExpr *E,
+ CodeGenFunction &CGF) {
+ SmallVector<llvm::Value *, 3> Args;
+ for (unsigned i = 0; i < E->getNumArgs(); ++i) {
+ Args.push_back(CGF.EmitScalarExpr(E->getArg(i)));
+ }
+ return CGF.Builder.CreateCall(
+ CGF.CGM.getIntrinsic(IntrinsicID, {Args[0]->getType()}), Args);
+}
+
} // namespace
Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
@@ -1197,6 +1208,118 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count),
{EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
+ case NVPTX::BI__nvvm_add_mixed_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_add_mixed_rn_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_rn_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rn_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_add_mixed_rz_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_rz_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rz_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_add_mixed_rm_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_rm_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rm_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_add_mixed_rp_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_rp_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rp_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_add_mixed_sat_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_sat_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_add_mixed_rn_sat_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_rn_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rn_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_add_mixed_rz_sat_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_rz_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rz_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_add_mixed_rm_sat_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_rm_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rm_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_add_mixed_rp_sat_f16_f32:
+ case NVPTX::BI__nvvm_add_mixed_rp_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rp_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_sub_mixed_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_sub_mixed_rn_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_rn_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rn_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_sub_mixed_rz_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_rz_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rz_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_sub_mixed_rm_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_rm_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rm_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_sub_mixed_rp_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_rp_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rp_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_sub_mixed_sat_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_sat_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_sub_mixed_rn_sat_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_rn_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rn_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_sub_mixed_rz_sat_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_rz_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rz_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_sub_mixed_rm_sat_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_rm_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rm_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_sub_mixed_rp_sat_f16_f32:
+ case NVPTX::BI__nvvm_sub_mixed_rp_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rp_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_fma_mixed_rn_f16_f32:
+ case NVPTX::BI__nvvm_fma_mixed_rn_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rn_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_fma_mixed_rz_f16_f32:
+ case NVPTX::BI__nvvm_fma_mixed_rz_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rz_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_fma_mixed_rm_f16_f32:
+ case NVPTX::BI__nvvm_fma_mixed_rm_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rm_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_fma_mixed_rp_f16_f32:
+ case NVPTX::BI__nvvm_fma_mixed_rp_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rp_f32, E,
+ *this);
+ case NVPTX::BI__nvvm_fma_mixed_rn_sat_f16_f32:
+ case NVPTX::BI__nvvm_fma_mixed_rn_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rn_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_fma_mixed_rz_sat_f16_f32:
+ case NVPTX::BI__nvvm_fma_mixed_rz_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rz_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_fma_mixed_rm_sat_f16_f32:
+ case NVPTX::BI__nvvm_fma_mixed_rm_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rm_sat_f32,
+ E, *this);
+ case NVPTX::BI__nvvm_fma_mixed_rp_sat_f16_f32:
+ case NVPTX::BI__nvvm_fma_mixed_rp_sat_bf16_f32:
+ return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rp_sat_f32,
+ E, *this);
default:
return nullptr;
}
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index e3be262622844..1753b4c7767e9 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1466,3 +1466,136 @@ __device__ void nvvm_min_max_sm86() {
#endif
// CHECK: ret void
}
+
+#define F16 (__fp16)0.1f
+#define F16_2 (__fp16)0.2f
+
+__device__ void nvvm_add_mixed_precision_sm100() {
+#if __CUDA_ARCH__ >= 1000
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rn.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_rn_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rz.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_rz_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rm.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_rm_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rp.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_rp_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_sat_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rn.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_rn_sat_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rz.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_rz_sat_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rm.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_rm_sat_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rp.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_add_mixed_rp_sat_f16_f32(F16, 1.0f);
+
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rn.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_rn_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rz.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_rz_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rm.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_rm_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rp.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_rp_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_sat_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rn.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_rn_sat_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rz.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_rz_sat_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rm.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_rm_sat_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rp.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_add_mixed_rp_sat_bf16_f32(BF16, 1.0f);
+#endif
+}
+
+__device__ void nvvm_sub_mixed_precision_sm100() {
+#if __CUDA_ARCH__ >= 1000
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rn.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_rn_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rz.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_rz_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rm.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_rm_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rp.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_rp_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_sat_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rn.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_rn_sat_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rz.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_rz_sat_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rm.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_rm_sat_f16_f32(F16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rp.sat.f32.f16(half 0xH2E66, float 1.000000e+00)
+ __nvvm_sub_mixed_rp_sat_f16_f32(F16, 1.0f);
+
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rn.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_rn_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rz.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_rz_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rm.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_rm_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rp.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_rp_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_sat_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rn.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_rn_sat_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rz.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_rz_sat_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rm.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_rm_sat_bf16_f32(BF16, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rp.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00)
+ __nvvm_sub_mixed_rp_sat_bf16_f32(BF16, 1.0f);
+#endif
+}
+
+__device__ void nvvm_fma_mixed_precision_sm100() {
+#if __CUDA_ARCH__ >= 1000
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rn.f32.f16(half 0xH2E66, half 0xH3266, float 1.000000e+00)
+ __nvvm_fma_mixed_rn_f16_f32(F16, F16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rz.f32.f16(half 0xH2E66, half 0xH3266, float 1.000000e+00)
+ __nvvm_fma_mixed_rz_f16_f32(F16, F16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rm.f32.f16(half 0xH2E66, half 0xH3266, float 1.000000e+00)
+ __nvvm_fma_mixed_rm_f16_f32(F16, F16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rp.f32.f16(half 0xH2E66, half 0xH3266, float 1.000000e+00)
+ __nvvm_fma_mixed_rp_f16_f32(F16, F16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rn.sat.f32.f16(half 0xH2E66, half 0xH3266, float 1.000000e+00)
+ __nvvm_fma_mixed_rn_sat_f16_f32(F16, F16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rz.sat.f32.f16(half 0xH2E66, half 0xH3266, float 1.000000e+00)
+ __nvvm_fma_mixed_rz_sat_f16_f32(F16, F16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rm.sat.f32.f16(half 0xH2E66, half 0xH3266, float 1.000000e+00)
+ __nvvm_fma_mixed_rm_sat_f16_f32(F16, F16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rp.sat.f32.f16(half 0xH2E66, half 0xH3266, float 1.000000e+00)
+ __nvvm_fma_mixed_rp_sat_f16_f32(F16, F16_2, 1.0f);
+
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rn.f32.bf16(bfloat 0xR3DCD, bfloat 0xR3E4D, float 1.000000e+00)
+ __nvvm_fma_mixed_rn_bf16_f32(BF16, BF16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rz.f32.bf16(bfloat 0xR3DCD, bfloat 0xR3E4D, float 1.000000e+00)
+ __nvvm_fma_mixed_rz_bf16_f32(BF16, BF16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rm.f32.bf16(bfloat 0xR3DCD, bfloat 0xR3E4D, float 1.000000e+00)
+ __nvvm_fma_mixed_rm_bf16_f32(BF16, BF16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rp.f32.bf16(bfloat 0xR3DCD, bfloat 0xR3E4D, float 1.000000e+00)
+ __nvvm_fma_mixed_rp_bf16_f32(BF16, BF16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rn.sat.f32.bf16(bfloat 0xR3DCD, bfloat 0xR3E4D, float 1.000000e+00)
+ __nvvm_fma_mixed_rn_sat_bf16_f32(BF16, BF16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rz.sat.f32.bf16(bfloat 0xR3DCD, bfloat 0xR3E4D, float 1.000000e+00)
+ __nvvm_fma_mixed_rz_sat_bf16_f32(BF16, BF16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rm.sat.f32.bf16(bfloat 0xR3DCD, bfloat 0xR3E4D, float 1.000000e+00)
+ __nvvm_fma_mixed_rm_sat_bf16_f32(BF16, BF16_2, 1.0f);
+ // CHECK_PTX86_SM100: call float @llvm.nvvm.fma.mixed.rp.sat.f32.bf16(bfloat 0xR3DCD, bfloat 0xR3E4D, float 1.000000e+00)
+ __nvvm_fma_mixed_rp_sat_bf16_f32(BF16, BF16_2, 1.0f);
+#endif
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 21badc2692037..7a7fce42e55b0 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1386,6 +1386,14 @@ let TargetPrefix = "nvvm" in {
PureIntrinsic<[llvm_double_ty],
[llvm_double_ty, llvm_double_ty, llvm_double_ty]>;
}
+
+ foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
+ foreach sat = ["", "_sat"] in {
+ def int_nvvm_fma_mixed # rnd # sat # _f32 :
+ PureIntrinsic<[llvm_float_ty],
+ [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_float_ty]>;
+ }
+ }
//
// Rcp
@@ -1453,6 +1461,23 @@ let TargetPrefix = "nvvm" in {
}
}
+ foreach rnd = ["", "_rn", "_rz", "_rm", "_rp"] in {
+ foreach sat = ["", "_sat"] in {
+ def int_nvvm_add_mixed # rnd # sat # _f32 :
+ PureIntrinsic<[llvm_float_ty], [llvm_anyfloat_ty, llvm_float_ty]>;
+ }
+ }
+
+ //
+ // Sub
+ //
+ foreach rnd = ["", "_rn", "_rz", "_rm", "_rp"] in {
+ foreach sat = ["", "_sat"] in {
+ def int_nvvm_sub_mixed # rnd # sat # _f32 :
+ PureIntrinsic<[llvm_float_ty], [llvm_anyfloat_ty, llvm_float_ty]>;
+ }
+ }
+
//
// Dot Product
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index ea69a54e6db37..07483bf5a3e3d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1694,6 +1694,20 @@ multiclass FMA_INST {
defm INT_NVVM_FMA : FMA_INST;
+foreach rnd = ["_RN", "_RZ", "_RM", "_RP"] in {
+ foreach sat = ["", "_SAT"] in {
+ foreach type = ["F16", "BF16"] in {
+ def INT_NVVM_FMA # rnd # sat # _F32_ # type :
+ BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b, B32:$c),
+ !tolower(!subst("_", ".", "fma" # rnd # sat # "_f32_" # type)),
+ [(set f32:$dst,
+ (!cast<Intrinsic>(!tolower("int_nvvm_fma_mixed" # rnd # sat # "_f32"))
+ !cast<ValueType>(!tolower(type)):$a, !cast<ValueType>(!tolower(type)):$b, f32:$c))]>,
+ Requires<[hasSM<100>, hasPTX<86>]>;
+ }
+ }
+}
+
//
// Rcp
//
@@ -1806,6 +1820,36 @@ def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64", B64, B64, B64, int_nvvm_add_rz_d>
def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64", B64, B64, B64, int_nvvm_add_rm_d>;
def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64", B64, B64, B64, int_nvvm_add_rp_d>;
+foreach rnd = ["", "_RN", "_RZ", "_RM", "_RP"] in {
+ foreach sat = ["", "_SAT"] in {
+ foreach type = ["F16", "BF16"] in {
+ def INT_NVVM_ADD # rnd # sat # _F32_ # type :
+ BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B32:$b),
+ !tolower(!subst("_", ".", "add" # rnd # sat # "_f32_" # type)),
+ [(set f32:$dst,
+ (!cast<Intrinsic>(!tolower("int_nvvm_add_mixed" # rnd # sat # "_f32"))
+ !cast<ValueType>(!tolower(type)):$a, f32:$b))]>,
+ Requires<[hasSM<100>, hasPTX<86>]>;
+ }
+ }
+}
+//
+// Sub
+//
+
+foreach rnd = ["", "_RN", "_RZ", "_RM", "_RP"] in {
+ foreach sat = ["", "_SAT"] in {
+ foreach type = ["F16", "BF16"] in {
+ def INT_NVVM_SUB # rnd # sat # _F32_ # type :
+ BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B32:$b),
+ !tolower(!subst("_", ".", "sub" # rnd # sat # "_f32_" # type)),
+ [(set f32:$dst,
+ (!cast<Intrinsic>(!tolower("int_nvvm_sub_mixed" # rnd # sat # "_f32"))
+ !cast<ValueType>(!tolower(type)):$a, f32:$b))]>,
+ Requires<[hasSM<100>, hasPTX<86>]>;
+ }
+ }
+}
//
// BFIND
//
diff --git a/llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll b/llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll
new file mode 100644
index 0000000000000..a4f2fe68830f5
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %}
+
+; ADD
+
+define float @test_add_f32_f16(half %a, float %b) {
+; CHECK-LABEL: test_add_f32_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<12>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test_add_f32_f16_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_add_f32_f16_param_1];
+; CHECK-NEXT: add.f32.f16 %r2, %rs1, %r1;
+; CHECK-NEXT: add.rn.f32.f16 %r3, %rs1, %r2;
+; CHECK-NEXT: add.rz.f32.f16 %r4, %rs1, %r3;
+; CHECK-NEXT: add.rm.f32.f16 %r5, %rs1, %r4;
+; CHECK-NEXT: add.rp.f32.f16 %r6, %rs1, %r5;
+; CHECK-NEXT: add.sat.f32.f16 %r7, %rs1, %r6;
+; CHECK-NEXT: add.rn.sat.f32.f16 %r8, %rs1, %r7;
+; CHECK-NEXT: add.rz.sat.f32.f16 %r9, %rs1, %r8;
+; CHECK-NEXT: add.rm.sat.f32.f16 %r10, %rs1, %r9;
+; CHECK-NEXT: add.rp.sat.f32.f16 %r11, %rs1, %r10;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
+; CHECK-NEXT: ret;
+ %r1 = call float @llvm.nvvm.add.mixed.f32.f16(half %a, float %b)
+ %r2 = call float @llvm.nvvm.add.mixed.rn.f32.f16(half %a, float %r1)
+ %r3 = call float @llvm.nvvm.add.mixed.rz.f32.f16(half %a, float %r2)
+ %r4 = call float @llvm.nvvm.add.mixed.rm.f32.f16(half %a, float %r3)
+ %r5 = call float @llvm.nvvm.add.mixed.rp.f32.f16(half %a, float %r4)
+
+ ; SAT
+ %r6 = call float @llvm.nvvm.add.mixed.sat.f32.f16(half %a, float %r5)
+ %r7 = call float @llvm.nvvm.add.mixed.rn.sat.f32.f16(half %a, float %r6)
+ %r8 = call float @llvm.nvvm.add.mixed.rz.sat.f32.f16(half %a, float %r7)
+ %r9 = call float @llvm.nvvm.add.mixed.rm.sat.f32.f16(half %a, float %r8)
+ %r10 = call float @llvm.nvvm.add.mixed.rp.sat.f32.f16(half %a, float %r9)
+
+ ret float %r10
+}
+
+define float @test_add_f32_bf16(bfloat %a, float %b) {
+; CHECK-LABEL: test_add_f32_bf16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<12>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test_add_f32_bf16_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_add_f32_bf16_param_1];
+; CHECK-NEXT: add.f32.bf16 %r2, %rs1, %r1;
+; CHECK-NEXT: add.rn.f32.bf16 %r3, %rs1, %r2;
+; CHECK-NEXT: add.rz.f32.bf16 %r4, %rs1, %r3;
+; CHECK-NEXT: add.rm.f32.bf16 %r5, %rs1, %r4;
+; CHECK-NEXT: add.rp.f32.bf16 %r6, %rs1, %r5;
+; CHECK-NEXT: add.sat.f32.bf16 %r7, %rs1, %r6;
+; CHECK-NEXT: add.rn.sat.f32.bf16 %r8, %rs1, %r7;
+; CHECK-NEXT: add.rz.sat.f32.bf16 %r9, %rs1, %r8;
+; CHECK-NEXT: add.rm.sat.f32.bf16 %r10, %rs1, %r9;
+; CHECK-NEXT: add.rp.sat.f32.bf16 %r11, %rs1, %r10;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
+; CHECK-NEXT: ret;
+ %r1 = call float @llvm.nvvm.add.mixed.f32.bf16(bfloat %a, float %b)
+ %r2 = call float @llvm.nvvm.add.mixed.rn.f32.bf16(bfloat %a, float %r1)
+ %r3 = call float @llvm.nvvm.add.mixed.rz.f32.bf16(bfloat %a, float %r2)
+ %r4 = call float @llvm.nvvm.add.mixed.rm.f32.bf16(bfloat %a, float %r3)
+ %r5 = call float @llvm.nvvm.add.mixed.rp.f32.bf16(bfloat %a, float %r4)
+
+ ; SAT
+ %r6 = call float @llvm.nvvm.add.mixed.sat.f32.bf16(bfloat %a, float %r5)
+ %r7 = call float @llvm.nvvm.add.mixed.rn.sat.f32.bf16(bfloat %a, float %r6)
+ %r8 = call float @llvm.nvvm.add.mixed.rz.sat.f32.bf16(bfloat %a, float %r7)
+ %r9 = call float @llvm.nvvm.add.mixed.rm.sat.f32.bf16(bfloat %a, float %r8)
+ %r10 = call float @llvm.nvvm.add.mixed.rp.sat.f32.bf16(bfloat %a, float %r9)
+
+ ret float %r10
+}
+
+; SUB
+
+define float @test_sub_f32_f16(half %a, float %b) {
+; CHECK-LABEL: test_sub_f32_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<12>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test_sub_f32_f16_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_sub_f32_f16_param_1];
+; CHECK-NEXT: sub.f32.f16 %r2, %rs1, %r1;
+; CHECK-NEXT: sub.rn.f32.f16 %r3, %rs1, %r2;
+; CHECK-NEXT: sub.rz.f32.f16 %r4, %rs1, %r3;
+; CHECK-NEXT: sub.rm.f32.f16 %r5, %rs1, %r4;
+; CHECK-NEXT: sub.rp.f32.f16 %r6, %rs1, %r5;
+; CHECK-NEXT: sub.sat.f32.f16 %r7, %rs1, %r6;
+; CHECK-NEXT: sub.rn.sat.f32.f16 %r8, %rs1, %r7;
+; CHECK-NEXT: sub.rz.sat.f32.f16 %r9, %rs1, %r8;
+; CHECK-NEXT: sub.rm.sat.f32.f16 %r10, %rs1, %r9;
+; CHECK-NEXT: sub.rp.sat.f32.f16 %r11, %rs1, %r10;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
+; CHECK-NEXT: ret;
+ %r1 = call float @llvm.nvvm.sub.mixed.f32.f16(half %a, float %b)
+ %r2 = call float @llvm.nvvm.sub.mixed.rn.f32.f16(half %a, float %r1)
+ %r3 = call float @llvm.nvvm.sub.mixed.rz.f32.f16(half %a, float %r2)
+ %r4 = call float @llvm.nvvm.sub.mixed.rm.f32.f16(half %a, float %r3)
+ %r5 = call float @llvm.nvvm.sub.mixed.rp.f32.f16(half %a, float %r4)
+
+ ; SAT
+ %r6 = call float @llvm.nvvm.sub.mixed.sat.f32.f16(half %a, float %r5)
+ %r7 = call float @llvm.nvvm.sub.mixed.rn.sat.f32.f16(half %a, float %r6)
+ %r8 = call float @llvm.nvvm.sub.mixed.rz.sat.f32.f16(half %a, float %r7)
+ %r9 = call float @llvm.nvvm.sub.mixed.rm.sat.f32.f16(half %a, float %r8)
+ %r10 = call float @llvm.nvvm.sub.mixed.rp.sat.f32.f16(half %a, float %r9)
+
+ ret float %r10
+}
+
+define float @test_sub_f32_bf16(bfloat %a, float %b) {
+; CHECK-LABEL: test_sub_f32_bf16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<12>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test_sub_f32_bf16_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_sub_f32_bf16_param_1];
+; CHECK-NEXT: sub.f32.bf16 %r2, %rs1, %r1;
+; CHECK-NEXT: sub.rn.f32.bf16 %r3, %rs1, %r2;
+; CHECK-NEXT: sub.rz.f32.bf16 %r4, %rs1, %r3;
+; CHECK-NEXT: sub.rm.f32.bf16 %r5, %rs1, %r4;
+; CHECK-NEXT: sub.rp.f32.bf16 %r6, %rs1, %r5;
+; CHECK-NEXT: sub.sat.f32.bf16 %r7, %rs1, %r6;
+; CHECK-NEXT: sub.rn.sat.f32.bf16 %r8, %rs1, %r7;
+; CHECK-NEXT: sub.rz.sat.f32.bf16 %r9, %rs1, %r8;
+; CHECK-NEXT: sub.rm.sat.f32.bf16 %r10, %rs1, %r9;
+; CHECK-NEXT: sub.rp.sat.f32.bf16 %r11, %rs1, %r10;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
+; CHECK-NEXT: ret;
+ %r1 = call float @llvm.nvvm.sub.mixed.f32.bf16(bfloat %a, float %b)
+ %r2 = call float @llvm.nvvm.sub.mixed.rn.f32.bf16(bfloat %a, float %r1)
+ %r3 = call float @llvm.nvvm.sub.mixed.rz.f32.bf16(bfloat %a, float %r2)
+ %r4 = call float @llvm.nvvm.sub.mixed.rm.f32.bf16(bfloat %a, float %r3)
+ %r5 = call float @llvm.nvvm.sub.mixed.rp.f32.bf16(bfloat %a, float %r4)
+
+ ; SAT
+ %r6 = call float @llvm.nvvm.sub.mixed.sat.f32.bf16(bfloat %a, float %r5)
+ %r7 = call float @llvm.nvvm.sub.mixed.rn.sat.f32.bf16(bfloat %a, float %r6)
+ %r8 = call float @llvm.nvvm.sub.mixed.rz.sat.f32.bf16(bfloat %a, float %r7)
+ %r9 = call float @llvm.nvvm.sub.mixed.rm.sat.f32.bf16(bfloat %a, float %r8)
+ %r10 = call float @llvm.nvvm.sub.mixed.rp.sat.f32.bf16(bfloat %a, float %r9)
+
+ ret float %r10
+}
+
+; FMA
+
+define float @test_fma_f32_f16(half %a, half %b, float %c) {
+; CHECK-LABEL: test_fma_f32_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test_fma_f32_f16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [test_fma_f32_f16_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_fma_f32_f16_param_2];
+; CHECK-NEXT: fma.rn.f32.f16 %r2, %rs1, %rs2, %r1;
+; CHECK-NEXT: fma.rz.f32.f16 %r3, %rs1, %rs2, %r2;
+; CHECK-NEXT: fma.rm.f32.f16 %r4, %rs1, %rs2, %r3;
+; CHECK-NEXT: fma.rp.f32.f16 %r5, %rs1, %rs2, %r4;
+; CHECK-NEXT: fma.rn.sat.f32.f16 %r6, %rs1, %rs2, %r5;
+; CHECK-NEXT: fma.rz.sat.f32.f16 %r7, %rs1, %rs2, %r6;
+; CHECK-NEXT: fma.rm.sat.f32.f16 %r8, %rs1, %rs2, %r7;
+; CHECK-NEXT: fma.rp.sat.f32.f16 %r9, %rs1, %rs2, %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NEXT: ret;
+ %r1= call float @llvm.nvvm.fma.mixed.rn.f32.f16(half %a, half %b, float %c)
+ %r2 = call float @llvm.nvvm.fma.mixed.rz.f32.f16(half %a, half %b, float %r1)
+ %r3 = call float @llvm.nvvm.fma.mixed.rm.f32.f16(half %a, half %b, float %r2)
+ %r4 = call float @llvm.nvvm.fma.mixed.rp.f32.f16(half %a, half %b, float %r3)
+
+ ; SAT
+ %r5 = call float @llvm.nvvm.fma.mixed.rn.sat.f32.f16(half %a, half %b, float %r4)
+ %r6 = call float @llvm.nvvm.fma.mixed.rz.sat.f32.f16(half %a, half %b, float %r5)
+ %r7 = call float @llvm.nvvm.fma.mixed.rm.sat.f32.f16(half %a, half %b, float %r6)
+ %r8 = call float @llvm.nvvm.fma.mixed.rp.sat.f32.f16(half %a, half %b, float %r7)
+
+ ret float %r8
+}
+
+define float @test_fma_f32_bf16(bfloat %a, bfloat %b, float %c) {
+; CHECK-LABEL: test_fma_f32_bf16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test_fma_f32_bf16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [test_fma_f32_bf16_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_fma_f32_bf16_param_2];
+; CHECK-NEXT: fma.rn.f32.bf16 %r2, %rs1, %rs2, %r1;
+; CHECK-NEXT: fma.rz.f32.bf16 %r3, %rs1, %rs2, %r2;
+; CHECK-NEXT: fma.rm.f32.bf16 %r4, %rs1, %rs2, %r3;
+; CHECK-NEXT: fma.rp.f32.bf16 %r5, %rs1, %rs2, %r4;
+; CHECK-NEXT: fma.rn.sat.f32.bf16 %r6, %rs1, %rs2, %r5;
+; CHECK-NEXT: fma.rz.sat.f32.bf16 %r7, %rs1, %rs2, %r6;
+; CHECK-NEXT: fma.rm.sat.f32.bf16 %r8, %rs1, %rs2, %r7;
+; CHECK-NEXT: fma.rp.sat.f32.bf16 %r9, %rs1, %rs2, %r8;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NEXT: ret;
+ %r1 = call float @llvm.nvvm.fma.mixed.rn.f32.bf16(bfloat %a, bfloat %b, float %c)
+ %r2 = call float @llvm.nvvm.fma.mixed.rz.f32.bf16(bfloat %a, bfloat %b, float %r1)
+ %r3 = call float @llvm.nvvm.fma.mixed.rm.f32.bf16(bfloat %a, bfloat %b, float %r2)
+ %r4 = call float @llvm.nvvm.fma.mixed.rp.f32.bf16(bfloat %a, bfloat %b, float %r3)
+
+ ; SAT
+ %r5 = call float @llvm.nvvm.fma.mixed.rn.sat.f32.bf16(bfloat %a, bfloat %b, float %r4)
+ %r6 = call float @llvm.nvvm.fma.mixed.rz.sat.f32.bf16(bfloat %a, bfloat %b, float %r5)
+ %r7 = call float @llvm.nvvm.fma.mixed.rm.sat.f32.bf16(bfloat %a, bfloat %b, float %r6)
+ %r8 = call float @llvm.nvvm.fma.mixed.rp.sat.f32.bf16(bfloat %a, bfloat %b, float %r7)
+
+ ret float %r8
+}
From 444a0a76bfc48750f490abcb854e46057743e723 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Mon, 17 Nov 2025 12:03:05 +0000
Subject: [PATCH 2/2] fix whitespace error
---
llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll b/llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll
index a4f2fe68830f5..adebcf868b2e6 100644
--- a/llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll
+++ b/llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll
@@ -176,7 +176,7 @@ define float @test_fma_f32_f16(half %a, half %b, float %c) {
; CHECK-NEXT: fma.rp.sat.f32.f16 %r9, %rs1, %rs2, %r8;
; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
- %r1= call float @llvm.nvvm.fma.mixed.rn.f32.f16(half %a, half %b, float %c)
+ %r1 = call float @llvm.nvvm.fma.mixed.rn.f32.f16(half %a, half %b, float %c)
%r2 = call float @llvm.nvvm.fma.mixed.rz.f32.f16(half %a, half %b, float %r1)
%r3 = call float @llvm.nvvm.fma.mixed.rm.f32.f16(half %a, half %b, float %r2)
%r4 = call float @llvm.nvvm.fma.mixed.rp.f32.f16(half %a, half %b, float %r3)