[clang] [llvm] [clang][NVPTX] Add f16(x2) add/mul FTZ intrinsics (PR #178237)
Srinivasa Ravi via cfe-commits
cfe-commits at lists.llvm.org
Tue Jan 27 07:53:51 PST 2026
https://github.com/Wolfram70 updated https://github.com/llvm/llvm-project/pull/178237
>From 868c76297d3bb5185d47fa0ddcd4a21b76a3a45f Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 27 Jan 2026 11:27:40 +0000
Subject: [PATCH 1/2] [clang][NVPTX] Add f16(x2) add/mul FTZ intrinsics
This change adds `llvm.nvvm.{add/mul}.rn.ftz.{f16/f16x2}` intrinsics
and corresponding clang builtins. These variants were missed in
https://github.com/llvm/llvm-project/pull/170079 which added
half-precision arithmetic intrinsics.
PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions
---
clang/include/clang/Basic/BuiltinsNVPTX.td | 4 +++
clang/test/CodeGen/builtins-nvptx.c | 10 +++++-
llvm/docs/NVPTXUsage.rst | 4 +++
llvm/include/llvm/IR/IntrinsicsNVVM.td | 12 +++++++
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 5 +++
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 7 +++++
llvm/test/CodeGen/NVPTX/f16-add-ftz.ll | 33 +++++++++++++++++++
llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll | 33 +++++++++++++++++++
llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll | 35 +++++++++++++++++++++
9 files changed, 142 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/NVPTX/f16-add-ftz.ll
create mode 100644 llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll
create mode 100644 llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 821c362d100c5..59d96551cc250 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -467,8 +467,10 @@ def __nvvm_rsqrt_approx_d : NVPTXBuiltin<"double(double)">;
// Add
def __nvvm_add_rn_sat_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>;
+def __nvvm_add_rn_ftz_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>;
def __nvvm_add_rn_ftz_sat_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>;
def __nvvm_add_rn_sat_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>;
+def __nvvm_add_rn_ftz_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>;
def __nvvm_add_rn_ftz_sat_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>;
def __nvvm_add_rn_ftz_f : NVPTXBuiltin<"float(float, float)">;
@@ -496,8 +498,10 @@ def __nvvm_add_rp_d : NVPTXBuiltin<"double(double, double)">;
// Mul
def __nvvm_mul_rn_sat_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>;
+def __nvvm_mul_rn_ftz_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>;
def __nvvm_mul_rn_ftz_sat_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>;
def __nvvm_mul_rn_sat_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>;
+def __nvvm_mul_rn_ftz_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>;
def __nvvm_mul_rn_ftz_sat_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>;
// Convert
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index a739b66042f19..8271b29c18968 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1589,22 +1589,30 @@ __device__ void nvvm_add_fma_f32_sat() {
#define F16X2_2 {(__fp16)0.2f, (__fp16)0.2f}
// CHECK-LABEL: nvvm_add_mul_f16_sat
-__device__ void nvvm_add_mul_f16_sat() {
+__device__ void nvvm_add_mul_f16_sat_ftz() {
// CHECK: call half @llvm.nvvm.add.rn.sat.f16
__nvvm_add_rn_sat_f16(F16, F16_2);
+ // CHECK: call half @llvm.nvvm.add.rn.ftz.f16
+ __nvvm_add_rn_ftz_f16(F16, F16_2);
// CHECK: call half @llvm.nvvm.add.rn.ftz.sat.f16
__nvvm_add_rn_ftz_sat_f16(F16, F16_2);
// CHECK: call <2 x half> @llvm.nvvm.add.rn.sat.v2f16
__nvvm_add_rn_sat_v2f16(F16X2, F16X2_2);
+ // CHECK: call <2 x half> @llvm.nvvm.add.rn.ftz.v2f16
+ __nvvm_add_rn_ftz_v2f16(F16X2, F16X2_2);
// CHECK: call <2 x half> @llvm.nvvm.add.rn.ftz.sat.v2f16
__nvvm_add_rn_ftz_sat_v2f16(F16X2, F16X2_2);
// CHECK: call half @llvm.nvvm.mul.rn.sat.f16
__nvvm_mul_rn_sat_f16(F16, F16_2);
+ // CHECK: call half @llvm.nvvm.mul.rn.ftz.f16
+ __nvvm_mul_rn_ftz_f16(F16, F16_2);
// CHECK: call half @llvm.nvvm.mul.rn.ftz.sat.f16
__nvvm_mul_rn_ftz_sat_f16(F16, F16_2);
// CHECK: call <2 x half> @llvm.nvvm.mul.rn.sat.v2f16
__nvvm_mul_rn_sat_v2f16(F16X2, F16X2_2);
+ // CHECK: call <2 x half> @llvm.nvvm.mul.rn.ftz.v2f16
+ __nvvm_mul_rn_ftz_v2f16(F16X2, F16X2_2);
// CHECK: call <2 x half> @llvm.nvvm.mul.rn.ftz.sat.v2f16
__nvvm_mul_rn_ftz_sat_v2f16(F16X2, F16X2_2);
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 0e7e21ad46b8d..d712b2d548f81 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1201,9 +1201,11 @@ Syntax:
.. code-block:: llvm
declare half @llvm.nvvm.add.rn.sat.f16(half %a, half %b)
+ declare half @llvm.nvvm.add.rn.ftz.f16(half %a, half %b)
declare <2 x half> @llvm.nvvm.add.rn.sat.v2f16(<2 x half> %a, <2 x half> %b)
declare half @llvm.nvvm.add.rn.ftz.sat.f16(half %a, half %b)
+ declare <2 x half> @llvm.nvvm.add.rn.ftz.v2f16(<2 x half> %a, <2 x half> %b)
declare <2 x half> @llvm.nvvm.add.rn.ftz.sat.v2f16(<2 x half> %a, <2 x half> %b)
Overview:
@@ -1229,9 +1231,11 @@ Syntax:
.. code-block:: llvm
declare half @llvm.nvvm.mul.rn.sat.f16(half %a, half %b)
+ declare half @llvm.nvvm.mul.rn.ftz.f16(half %a, half %b)
declare <2 x half> @llvm.nvvm.mul.rn.sat.v2f16(<2 x half> %a, <2 x half> %b)
declare half @llvm.nvvm.mul.rn.ftz.sat.f16(half %a, half %b)
+ declare <2 x half> @llvm.nvvm.mul.rn.ftz.v2f16(<2 x half> %a, <2 x half> %b)
declare <2 x half> @llvm.nvvm.mul.rn.ftz.sat.v2f16(<2 x half> %a, <2 x half> %b)
Overview:
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index e5e08aacd2535..7ff7515db82c5 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1373,6 +1373,12 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_mul_rn # ftz # _sat_v2f16 : NVVMBuiltin,
DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>;
} // ftz
+
+ def int_nvvm_mul_rn_ftz_f16 : NVVMBuiltin,
+ DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty]>;
+
+ def int_nvvm_mul_rn_ftz_v2f16 : NVVMBuiltin,
+ DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>;
}
//
@@ -1612,6 +1618,12 @@ let TargetPrefix = "nvvm" in {
DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>;
} // ftz
+
+ def int_nvvm_add_rn_ftz_f16 : NVVMBuiltin,
+ DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty]>;
+
+ def int_nvvm_add_rn_ftz_v2f16 : NVVMBuiltin,
+ DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>;
}
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1be35a1c67457..5b62b9d073c19 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6857,6 +6857,9 @@ static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID) {
case Intrinsic::nvvm_add_rn_sat_f16:
case Intrinsic::nvvm_add_rn_sat_v2f16:
return NVPTXISD::SUB_RN_SAT;
+ case Intrinsic::nvvm_add_rn_ftz_f16:
+ case Intrinsic::nvvm_add_rn_ftz_v2f16:
+ return NVPTXISD::SUB_RN_FTZ;
case Intrinsic::nvvm_add_rn_ftz_sat_f16:
case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
return NVPTXISD::SUB_RN_FTZ_SAT;
@@ -6895,8 +6898,10 @@ static SDValue combineIntrinsicWOChain(SDNode *N,
default:
break;
case Intrinsic::nvvm_add_rn_sat_f16:
+ case Intrinsic::nvvm_add_rn_ftz_f16:
case Intrinsic::nvvm_add_rn_ftz_sat_f16:
case Intrinsic::nvvm_add_rn_sat_v2f16:
+ case Intrinsic::nvvm_add_rn_ftz_v2f16:
case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
return combineF16AddWithNeg(N, DCI.DAG, IID);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index ad5dd356ee90f..64fba7339ad0a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1504,8 +1504,10 @@ def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32", B32, B32, B32, int_nvvm_mul24_i>
def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32", B32, B32, B32, int_nvvm_mul24_ui>;
def INT_NVVM_MUL_RN_SAT_F16 : F_MATH_2<"mul.rn.sat.f16", B16, B16, B16, int_nvvm_mul_rn_sat_f16>;
+def INT_NVVM_MUL_RN_FTZ_F16 : F_MATH_2<"mul.rn.ftz.f16", B16, B16, B16, int_nvvm_mul_rn_ftz_f16>;
def INT_NVVM_MUL_RN_FTZ_SAT_F16 : F_MATH_2<"mul.rn.ftz.sat.f16", B16, B16, B16, int_nvvm_mul_rn_ftz_sat_f16>;
def INT_NVVM_MUL_RN_SAT_F16X2 : F_MATH_2<"mul.rn.sat.f16x2", B32, B32, B32, int_nvvm_mul_rn_sat_v2f16>;
+def INT_NVVM_MUL_RN_FTZ_F16X2 : F_MATH_2<"mul.rn.ftz.f16x2", B32, B32, B32, int_nvvm_mul_rn_ftz_v2f16>;
def INT_NVVM_MUL_RN_FTZ_SAT_F16X2 : F_MATH_2<"mul.rn.ftz.sat.f16x2", B32, B32, B32, int_nvvm_mul_rn_ftz_sat_v2f16>;
//
@@ -1876,8 +1878,10 @@ let Predicates = [doRsqrtOpt] in {
//
def INT_NVVM_ADD_RN_SAT_F16 : F_MATH_2<"add.rn.sat.f16", B16, B16, B16, int_nvvm_add_rn_sat_f16>;
+def INT_NVVM_ADD_RN_FTZ_F16 : F_MATH_2<"add.rn.ftz.f16", B16, B16, B16, int_nvvm_add_rn_ftz_f16>;
def INT_NVVM_ADD_RN_FTZ_SAT_F16 : F_MATH_2<"add.rn.ftz.sat.f16", B16, B16, B16, int_nvvm_add_rn_ftz_sat_f16>;
def INT_NVVM_ADD_RN_SAT_F16X2 : F_MATH_2<"add.rn.sat.f16x2", B32, B32, B32, int_nvvm_add_rn_sat_v2f16>;
+def INT_NVVM_ADD_RN_FTZ_F16X2 : F_MATH_2<"add.rn.ftz.f16x2", B32, B32, B32, int_nvvm_add_rn_ftz_v2f16>;
def INT_NVVM_ADD_RN_FTZ_SAT_F16X2 : F_MATH_2<"add.rn.ftz.sat.f16x2", B32, B32, B32, int_nvvm_add_rn_ftz_sat_v2f16>;
def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32", B32, B32, B32, int_nvvm_add_rn_ftz_f>;
@@ -1930,6 +1934,7 @@ let Predicates = [hasSM<100>, hasPTX<86>, doNoF32FTZ] in {
//
def sub_rn_sat : SDNode<"NVPTXISD::SUB_RN_SAT", SDTFPBinOp>;
+def sub_rn_ftz : SDNode<"NVPTXISD::SUB_RN_FTZ", SDTFPBinOp>;
def sub_rn_ftz_sat :
SDNode<"NVPTXISD::SUB_RN_FTZ_SAT", SDTFPBinOp>;
@@ -1940,8 +1945,10 @@ class INT_NVVM_SUB_RN<RegTyInfo TyInfo, string variant> :
(!cast<SDNode>("sub_rn" # variant) TyInfo.Ty:$a, TyInfo.Ty:$b))]>;
def INT_NVVM_SUB_RN_SAT_F16 : INT_NVVM_SUB_RN<F16RT, "_sat">;
+def INT_NVVM_SUB_RN_FTZ_F16 : INT_NVVM_SUB_RN<F16RT, "_ftz">;
def INT_NVVM_SUB_RN_FTZ_SAT_F16 : INT_NVVM_SUB_RN<F16RT, "_ftz_sat">;
def INT_NVVM_SUB_RN_SAT_F16X2 : INT_NVVM_SUB_RN<F16X2RT, "_sat">;
+def INT_NVVM_SUB_RN_FTZ_F16X2 : INT_NVVM_SUB_RN<F16X2RT, "_ftz">;
def INT_NVVM_SUB_RN_FTZ_SAT_F16X2 : INT_NVVM_SUB_RN<F16X2RT, "_ftz_sat">;
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
diff --git a/llvm/test/CodeGen/NVPTX/f16-add-ftz.ll b/llvm/test/CodeGen/NVPTX/f16-add-ftz.ll
new file mode 100644
index 0000000000000..fbe348fcc966f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f16-add-ftz.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s
+; RUN: %if ptxas-isa-4.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify%}
+
+define half @add_rn_ftz_f16(half %a, half %b) {
+; CHECK-LABEL: add_rn_ftz_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [add_rn_ftz_f16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [add_rn_ftz_f16_param_1];
+; CHECK-NEXT: add.rn.ftz.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: ret;
+ %f1 = call half @llvm.nvvm.add.rn.ftz.f16(half %a, half %b)
+ ret half %f1
+}
+
+define <2 x half> @add_rn_ftz_f16x2(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: add_rn_ftz_f16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [add_rn_ftz_f16x2_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [add_rn_ftz_f16x2_param_1];
+; CHECK-NEXT: add.rn.ftz.f16x2 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %f1 = call <2 x half> @llvm.nvvm.add.rn.ftz.v2f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %f1
+}
diff --git a/llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll b/llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll
new file mode 100644
index 0000000000000..c2ebf8fd49db3
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s
+; RUN: %if ptxas-isa-4.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify%}
+
+define half @mul_rn_ftz_f16(half %a, half %b) {
+; CHECK-LABEL: mul_rn_ftz_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [mul_rn_ftz_f16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [mul_rn_ftz_f16_param_1];
+; CHECK-NEXT: mul.rn.ftz.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: ret;
+ %f1 = call half @llvm.nvvm.mul.rn.ftz.f16(half %a, half %b)
+ ret half %f1
+}
+
+define <2 x half> @mul_rn_ftz_f16x2(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: mul_rn_ftz_f16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [mul_rn_ftz_f16x2_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [mul_rn_ftz_f16x2_param_1];
+; CHECK-NEXT: mul.rn.ftz.f16x2 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %f1 = call <2 x half> @llvm.nvvm.mul.rn.ftz.v2f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %f1
+}
diff --git a/llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll b/llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll
new file mode 100644
index 0000000000000..7164924caf620
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s
+; RUN: %if ptxas-isa-4.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify%}
+
+define half @sub_rn_ftz_f16(half %a, half %b) {
+; CHECK-LABEL: sub_rn_ftz_f16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [sub_rn_ftz_f16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [sub_rn_ftz_f16_param_1];
+; CHECK-NEXT: sub.rn.ftz.f16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: ret;
+ %f0 = fneg half %b
+ %f1 = call half @llvm.nvvm.add.rn.ftz.f16(half %a, half %f0)
+ ret half %f1
+}
+
+define <2 x half> @sub_rn_ftz_f16x2(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: sub_rn_ftz_f16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [sub_rn_ftz_f16x2_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [sub_rn_ftz_f16x2_param_1];
+; CHECK-NEXT: sub.rn.ftz.f16x2 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %f0 = fneg <2 x half> %b
+ %f1 = call <2 x half> @llvm.nvvm.add.rn.ftz.v2f16(<2 x half> %a, <2 x half> %f0)
+ ret <2 x half> %f1
+}
>From fd4fd4f9e645246a609a1d09dc9ce770bc689b55 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 27 Jan 2026 21:23:40 +0530
Subject: [PATCH 2/2] update check line in builtins-nvptx.c
---
clang/test/CodeGen/builtins-nvptx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 8271b29c18968..9ea2f416be293 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1588,7 +1588,7 @@ __device__ void nvvm_add_fma_f32_sat() {
#define F16X2 {(__fp16)0.1f, (__fp16)0.1f}
#define F16X2_2 {(__fp16)0.2f, (__fp16)0.2f}
-// CHECK-LABEL: nvvm_add_mul_f16_sat
+// CHECK-LABEL: nvvm_add_mul_f16_sat_ftz
__device__ void nvvm_add_mul_f16_sat_ftz() {
// CHECK: call half @llvm.nvvm.add.rn.sat.f16
__nvvm_add_rn_sat_f16(F16, F16_2);
More information about the cfe-commits
mailing list