[llvm] [NVPTX] Add intrinsics for new narrow FP conversions (PR #173954)
Srinivasa Ravi via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 31 02:00:48 PST 2025
https://github.com/Wolfram70 updated https://github.com/llvm/llvm-project/pull/173954
>From 559dfaec3f96ad1a84b5e3f8d71a8b89a3e9e5dc Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 16 Dec 2025 08:59:14 +0000
Subject: [PATCH 1/6] [NVPTX] Add intrinsics for new narrow FP conversions
This change adds intrinsics for the following new narrow FP conversions
introduced in PTX 9.1:
- `bf16x2` to `f8x2`
- `(b)f16x2` to `f6x2`
- `(b)f16x2` to `f4x2`
- All `s2f6x2` conversions.
PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 42 +++
llvm/lib/Target/NVPTX/NVPTX.td | 2 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 77 ++++-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 78 +++++
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 14 +
.../CodeGen/NVPTX/convert_fp4x2_sm_100f.ll | 84 ++++++
.../CodeGen/NVPTX/convert_fp6x2_sm_100f.ll | 132 +++++++++
.../CodeGen/NVPTX/convert_fp8x2_sm_100f.ll | 67 +++++
.../CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll | 269 ++++++++++++++++++
9 files changed, 761 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll
create mode 100644 llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll
create mode 100644 llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
create mode 100644 llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 911de5e14e9db..d9f9207a382ad 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1736,6 +1736,33 @@ let TargetPrefix = "nvvm" in {
PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>;
}
+ foreach relu = ["", "_relu"] in {
+ def int_nvvm_ff_to_s2f6x2_rn # relu # _satfinite :
+ PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
+
+ def int_nvvm_ff_to_s2f6x2_rn # relu # _satfinite_scale_n2_ue8m0 :
+ PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty, llvm_i16_ty]>;
+
+ def int_nvvm_bf16x2_to_s2f6x2_rn # relu # _satfinite :
+ PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;
+
+ def int_nvvm_bf16x2_to_s2f6x2_rn # relu # _satfinite_scale_n2_ue8m0 :
+ PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty, llvm_i16_ty]>;
+
+ def int_nvvm_s2f6x2_to_bf16x2_rn # relu # _satfinite :
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>;
+
+ def int_nvvm_s2f6x2_to_bf16x2_rn # relu # _satfinite_scale_n2_ue8m0 :
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty, llvm_i16_ty]>;
+
+ // Variants without satfinite (s2f6x2 -> bf16x2 only)
+ def int_nvvm_s2f6x2_to_bf16x2_rn # relu :
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>;
+
+ def int_nvvm_s2f6x2_to_bf16x2_rn # relu # _scale_n2_ue8m0 :
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty, llvm_i16_ty]>;
+ }
+
foreach type = ["e4m3x2", "e5m2x2"] in {
foreach relu = ["", "_relu"] in {
def int_nvvm_ff_to_ # type # _rn # relu : NVVMBuiltin,
@@ -1746,6 +1773,9 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin,
PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
+
+ def int_nvvm_bf16x2_to_ # type # _rn # relu
+ : PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;
}
}
@@ -1765,6 +1795,12 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin,
PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
+
+ def int_nvvm_f16x2_to_e2m1x2_rn # relu # _satfinite
+ : PureIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>;
+
+ def int_nvvm_bf16x2_to_e2m1x2_rn # relu # _satfinite
+ : PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;
}
// RS rounding mode (Stochastic Rounding) conversions for f4x4 type
@@ -1782,6 +1818,12 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin,
PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
+
+ def int_nvvm_f16x2_to_ # type # _rn # relu # _satfinite
+ : PureIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>;
+
+ def int_nvvm_bf16x2_to_ # type # _rn # relu # _satfinite
+ : PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index d41a43de95098..b704a664df106 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -99,7 +99,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, 60,
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72,
73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88,
- 90] in
+ 90, 91] in
def PTX#version : FeaturePTX<version>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 74a552502ccf2..e3a038ed9a9df 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -659,6 +659,11 @@ let hasSideEffects = false in {
(ins B32:$src), (ins CvtMode:$mode),
"cvt${mode:base}.satfinite${mode:relu}." # F8Name # "x2.f16x2">,
Requires<[hasPTX<81>, hasSM<89>]>;
+ def _bf16x2
+ : NVPTXInst<(outs B16:$dst), (ins B32:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}.satfinite${mode:relu}.", F8Name,
+ "x2.bf16x2 \t$dst, $src;"),
+ []>;
}
defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">;
@@ -703,12 +708,40 @@ let hasSideEffects = false in {
defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>;
defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
+
+ def CVT_s2f6x2_f32_sf : NVPTXInst<(outs B16:$dst),
+ (ins B32:$src1, B32:$src2, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.s2f6x2.f32 \t$dst, $src1, $src2;", []>;
+ def CVT_s2f6x2_f32_sf_scale : NVPTXInst<(outs B16:$dst),
+ (ins B32:$src1, B32:$src2, B16:$scale, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.s2f6x2.f32 \t"
+ "$dst, $src1, $src2, $scale;", []>;
+
+ def CVT_s2f6x2_bf16x2_sf : NVPTXInst<(outs B16:$dst),
+ (ins B32:$src, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.s2f6x2.bf16x2 \t$dst, $src;", []>;
+ def CVT_s2f6x2_bf16x2_sf_scale : NVPTXInst<(outs B16:$dst),
+ (ins B32:$src, B16:$scale, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.s2f6x2.bf16x2 \t"
+ "$dst, $src, $scale;", []>;
+
+ def CVT_bf16x2_s2f6x2 : NVPTXInst<(outs B32:$dst),
+ (ins B16:$src, CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.bf16x2.s2f6x2 \t$dst, $src;", []>;
+ def CVT_bf16x2_s2f6x2_scale : NVPTXInst<(outs B32:$dst),
+ (ins B16:$src, B16:$scale, CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.scaled::n2::ue8m0.bf16x2.s2f6x2 \t"
+ "$dst, $src, $scale;", []>;
+ def CVT_bf16x2_s2f6x2_sf : NVPTXInst<(outs B32:$dst),
+ (ins B16:$src, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.bf16x2.s2f6x2 \t$dst, $src;", []>;
+ def CVT_bf16x2_s2f6x2_sf_scale : NVPTXInst<(outs B32:$dst),
+ (ins B16:$src, B16:$scale, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.bf16x2.s2f6x2 \t"
+ "$dst, $src, $scale;", []>;
// FP6 conversions.
foreach type = ["e2m3x2", "e3m2x2"] in {
- def CVT_ # type # _f32_sf : BasicFlagsNVPTXInst<(outs B16:$dst),
- (ins B32:$src1, B32:$src2), (ins CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}." # type # ".f32">;
def CVT_f16x2_ # type : BasicFlagsNVPTXInst<(outs B32:$dst),
(ins B16:$src), (ins CvtMode:$mode),
"cvt${mode:base}${mode:relu}.f16x2." # type>;
@@ -723,6 +756,30 @@ let hasSideEffects = false in {
def CVT_e2m3x4_f32x4_rs_sf : CVT_TO_FP6X4<"e2m3">;
def CVT_e3m2x4_f32x4_rs_sf : CVT_TO_FP6X4<"e3m2">;
+ multiclass CVT_TO_FP6X2<string FP6Name> {
+ def _f32_sf :
+ NVPTXInst<(outs B16:$dst),
+ (ins B32:$src1, B32:$src2, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}." # FP6Name
+ # "x2.f32"
+ "\t$dst, $src1, $src2;", []>;
+ def _f16x2_sf
+ : NVPTXInst<
+ (outs B16:$dst), (ins B32:$src, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}."#FP6Name#"x2.f16x2 "
+ "\t$dst, $src;",
+ []>;
+ def _bf16x2_sf
+ : NVPTXInst<
+ (outs B16:$dst), (ins B32:$src, CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}."#FP6Name#"x2.bf16x2 "
+ "\t$dst, $src;",
+ []>;
+ }
+
+ defm CVT_e2m3x2 : CVT_TO_FP6X2<"e2m3">;
+ defm CVT_e3m2x2 : CVT_TO_FP6X2<"e3m2">;
+
// FP4 conversions.
def CVT_e2m1x2_f32_sf : NVPTXInst<(outs B16:$dst),
(ins B32:$src1, B32:$src2, CvtMode:$mode),
@@ -746,6 +803,20 @@ let hasSideEffects = false in {
"cvt${mode:base}${mode:relu}.satfinite.e2m1x4.f32 \t" #
"$dst, {{$src1, $src2, $src3, $src4}}, $src5;">;
+ def CVT_e2m1x2_f16x2_sf : NVPTXInst<(outs B16:$dst),
+ (ins B32:$src, CvtMode:$mode),
+ !strconcat("{{ \n\t", ".reg .b8 \t%e2m1x2_out; \n\t",
+ "cvt${mode:base}.satfinite${mode:relu}.e2m1x2."
+ "f16x2 \t%e2m1x2_out, $src; \n\t",
+ "cvt.u16.u8 \t$dst, %e2m1x2_out; \n\t", "}}"), []>;
+
+ def CVT_e2m1x2_bf16x2_sf : NVPTXInst<(outs B16:$dst),
+ (ins B32:$src, CvtMode:$mode),
+ !strconcat("{{ \n\t", ".reg .b8 \t%e2m1x2_out; \n\t",
+ "cvt${mode:base}.satfinite${mode:relu}.e2m1x2."
+ "bf16x2 \t%e2m1x2_out, $src; \n\t",
+ "cvt.u16.u8 \t$dst, %e2m1x2_out; \n\t", "}}"), []>;
+
// UE8M0x2 conversions.
class CVT_f32_to_ue8m0x2<string sat = ""> :
BasicFlagsNVPTXInst<(outs B16:$dst),
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 2f6894867c43d..d31ab6fc4b2da 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2194,7 +2194,20 @@ let Predicates = [callSubtarget<"hasFP8ConversionSupport">] in {
(CVT_e5m2x2_f32 $a, $b, CvtRN)>;
def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
(CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
+}
+
+let Predicates = [callSubtarget<"hasFP16X2ToNarrowFPConversionSupport">] in {
+ foreach dst_type = ["e4m3x2", "e5m2x2"] in {
+ foreach relu = ["", "_relu"] in {
+ def : Pat<(!cast<Intrinsic>("int_nvvm_bf16x2_to_" # dst_type # "_rn" # relu)
+ B32:$a),
+ (!cast<NVPTXInst>("CVT_"# dst_type #"_bf16x2") $a,
+ !cast<PatLeaf>("CvtRN" # !toupper(relu)))>;
+ }
+ }
+}
+let Predicates = [callSubtarget<"hasFP8ConversionSupport">] in {
def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a),
(CVT_e4m3x2_f16x2 $a, CvtRN)>;
def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a),
@@ -2214,6 +2227,43 @@ let Predicates = [callSubtarget<"hasFP8ConversionSupport">] in {
(CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
}
+let Predicates = [callSubtarget<"hasS2F6X2ConversionSupport">] in {
+ def : Pat<(int_nvvm_ff_to_s2f6x2_rn_satfinite f32:$a, f32:$b),
+ (CVT_s2f6x2_f32_sf $a, $b, CvtRN)>;
+ def : Pat<(int_nvvm_ff_to_s2f6x2_rn_relu_satfinite f32:$a, f32:$b),
+ (CVT_s2f6x2_f32_sf $a, $b, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_ff_to_s2f6x2_rn_satfinite_scale_n2_ue8m0 f32:$a, f32:$b, i16:$scale),
+ (CVT_s2f6x2_f32_sf_scale $a, $b, $scale, CvtRN)>;
+ def : Pat<(int_nvvm_ff_to_s2f6x2_rn_relu_satfinite_scale_n2_ue8m0 f32:$a, f32:$b, i16:$scale),
+ (CVT_s2f6x2_f32_sf_scale $a, $b, $scale, CvtRN_RELU)>;
+
+ def : Pat<(int_nvvm_bf16x2_to_s2f6x2_rn_satfinite v2bf16:$a),
+ (CVT_s2f6x2_bf16x2_sf $a, CvtRN)>;
+ def : Pat<(int_nvvm_bf16x2_to_s2f6x2_rn_relu_satfinite v2bf16:$a),
+ (CVT_s2f6x2_bf16x2_sf $a, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_bf16x2_to_s2f6x2_rn_satfinite_scale_n2_ue8m0 v2bf16:$a, i16:$scale),
+ (CVT_s2f6x2_bf16x2_sf_scale $a, $scale, CvtRN)>;
+ def : Pat<(int_nvvm_bf16x2_to_s2f6x2_rn_relu_satfinite_scale_n2_ue8m0 v2bf16:$a, i16:$scale),
+ (CVT_s2f6x2_bf16x2_sf_scale $a, $scale, CvtRN_RELU)>;
+
+ def : Pat<(int_nvvm_s2f6x2_to_bf16x2_rn i16:$a),
+ (CVT_bf16x2_s2f6x2 $a, CvtRN)>;
+ def : Pat<(int_nvvm_s2f6x2_to_bf16x2_rn_relu i16:$a),
+ (CVT_bf16x2_s2f6x2 $a, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_s2f6x2_to_bf16x2_rn_scale_n2_ue8m0 i16:$a, i16:$scale),
+ (CVT_bf16x2_s2f6x2_scale $a, $scale, CvtRN)>;
+ def : Pat<(int_nvvm_s2f6x2_to_bf16x2_rn_relu_scale_n2_ue8m0 i16:$a, i16:$scale),
+ (CVT_bf16x2_s2f6x2_scale $a, $scale, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_s2f6x2_to_bf16x2_rn_satfinite i16:$a),
+ (CVT_bf16x2_s2f6x2_sf $a, CvtRN)>;
+ def : Pat<(int_nvvm_s2f6x2_to_bf16x2_rn_relu_satfinite i16:$a),
+ (CVT_bf16x2_s2f6x2_sf $a, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_s2f6x2_to_bf16x2_rn_satfinite_scale_n2_ue8m0 i16:$a, i16:$scale),
+ (CVT_bf16x2_s2f6x2_sf_scale $a, $scale, CvtRN)>;
+ def : Pat<(int_nvvm_s2f6x2_to_bf16x2_rn_relu_satfinite_scale_n2_ue8m0 i16:$a, i16:$scale),
+ (CVT_bf16x2_s2f6x2_sf_scale $a, $scale, CvtRN_RELU)>;
+}
+
let Predicates = [callSubtarget<"hasNarrowFPConversionSupport">] in {
def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b),
(CVT_e2m3x2_f32_sf $a, $b, CvtRN)>;
@@ -2232,7 +2282,22 @@ let Predicates = [callSubtarget<"hasNarrowFPConversionSupport">] in {
(CVT_f16x2_e3m2x2 $a, CvtRN)>;
def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a),
(CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>;
+}
+let Predicates = [callSubtarget<"hasFP16X2ToNarrowFPConversionSupport">] in {
+ foreach src_type = ["f16x2", "bf16x2"] in {
+ foreach dst_type = ["e2m3x2", "e3m2x2"] in {
+ foreach relu = ["", "_relu"] in {
+ def : Pat<(!cast<Intrinsic>("int_nvvm_" # src_type # "_to_" # dst_type #
+ "_rn" # relu # "_satfinite") B32:$a),
+ (!cast<NVPTXInst>("CVT_" # dst_type # "_" # src_type # "_sf")
+ $a, !cast<PatLeaf>("CvtRN" # !toupper(relu)))>;
+ }
+ }
+ }
+}
+
+let Predicates = [callSubtarget<"hasNarrowFPConversionSupport">] in {
def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b),
(CVT_e2m1x2_f32_sf $a, $b, CvtRN)>;
def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b),
@@ -2242,7 +2307,20 @@ let Predicates = [callSubtarget<"hasNarrowFPConversionSupport">] in {
(CVT_f16x2_e2m1x2 $a, CvtRN)>;
def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu i16:$a),
(CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>;
+}
+let Predicates = [callSubtarget<"hasFP16X2ToNarrowFPConversionSupport">] in {
+ foreach src_type = ["f16x2", "bf16x2"] in {
+ foreach relu = ["", "_relu"] in {
+ def : Pat<(!cast<Intrinsic>("int_nvvm_" # src_type # "_to_e2m1x2_rn" # relu
+ # "_satfinite") B32:$a),
+ (!cast<NVPTXInst>("CVT_e2m1x2_" # src_type # "_sf") $a,
+ !cast<PatLeaf>("CvtRN" # !toupper(relu)))>;
+ }
+ }
+}
+
+let Predicates = [callSubtarget<"hasNarrowFPConversionSupport">] in {
def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b),
(CVT_ue8m0x2_f32 $a, $b, CvtRZ)>;
def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b),
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 6f6057b3689e6..c8ca0441fd2ef 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -201,6 +201,20 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
hasPTXWithFamilySMs(88, {100, 101, 120}) ||
hasPTXWithAccelSMs(86, {100, 101, 120});
}
+
+ // Checks support for the following conversions:
+ // - bf16x2 -> f8x2
+ // - f16x2 -> f6x2
+ // - bf16x2 -> f6x2
+ // - f16x2 -> f4x2
+ // - bf16x2 -> f4x2
+ bool hasFP16X2ToNarrowFPConversionSupport() const {
+ return hasPTXWithFamilySMs(91, {100, 110, 120});
+ }
+
+ bool hasS2F6X2ConversionSupport() const {
+ return hasPTXWithAccelSMs(91, {100, 103, 110, 120, 121});
+ }
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it would assume that control flow
diff --git a/llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll b/llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll
new file mode 100644
index 0000000000000..3862e23effe3b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | FileCheck %s
+
+; F16x2 to E2M1x2 (fp4x2)
+define i16 @cvt_rn_f16x2_e2m1x2(<2 x half> %in) {
+; CHECK-LABEL: cvt_rn_f16x2_e2m1x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16x2_e2m1x2_param_0];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b8 %e2m1x2_out;
+; CHECK-NEXT: cvt.rn.satfinite.e2m1x2.f16x2 %e2m1x2_out, %r1;
+; CHECK-NEXT: cvt.u16.u8 %rs1, %e2m1x2_out;
+; CHECK-NEXT: }
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.f16x2.to.e2m1x2.rn.satfinite(<2 x half> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_relu_f16x2_e2m1x2(<2 x half> %in) {
+; CHECK-LABEL: cvt_rn_relu_f16x2_e2m1x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_e2m1x2_param_0];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b8 %e2m1x2_out;
+; CHECK-NEXT: cvt.rn.satfinite.relu.e2m1x2.f16x2 %e2m1x2_out, %r1;
+; CHECK-NEXT: cvt.u16.u8 %rs1, %e2m1x2_out;
+; CHECK-NEXT: }
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.f16x2.to.e2m1x2.rn.relu.satfinite(<2 x half> %in)
+ ret i16 %val
+}
+
+; BF16x2 to E2M1x2 (fp4x2)
+define i16 @cvt_rn_sf_e2m1x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_sf_e2m1x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_sf_e2m1x2_bf16x2_param_0];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b8 %e2m1x2_out;
+; CHECK-NEXT: cvt.rn.satfinite.e2m1x2.bf16x2 %e2m1x2_out, %r1;
+; CHECK-NEXT: cvt.u16.u8 %rs1, %e2m1x2_out;
+; CHECK-NEXT: }
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e2m1x2.rn.satfinite(<2 x bfloat> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_relu_sf_e2m1x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_relu_sf_e2m1x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_sf_e2m1x2_bf16x2_param_0];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .b8 %e2m1x2_out;
+; CHECK-NEXT: cvt.rn.satfinite.relu.e2m1x2.bf16x2 %e2m1x2_out, %r1;
+; CHECK-NEXT: cvt.u16.u8 %rs1, %e2m1x2_out;
+; CHECK-NEXT: }
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e2m1x2.rn.relu.satfinite(<2 x bfloat> %in)
+ ret i16 %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll b/llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll
new file mode 100644
index 0000000000000..43992328767eb
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | FileCheck %s
+
+; F16x2 to E2M3x2/E3M2x2 (fp6)
+define i16 @cvt_rn_f16x2_e2m3x2(<2 x half> %in) {
+; CHECK-LABEL: cvt_rn_f16x2_e2m3x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16x2_e2m3x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.e2m3x2.f16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.f16x2.to.e2m3x2.rn.satfinite(<2 x half> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_relu_f16x2_e2m3x2(<2 x half> %in) {
+; CHECK-LABEL: cvt_rn_relu_f16x2_e2m3x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_e2m3x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.relu.e2m3x2.f16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.f16x2.to.e2m3x2.rn.relu.satfinite(<2 x half> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_f16x2_e3m2x2(<2 x half> %in) {
+; CHECK-LABEL: cvt_rn_f16x2_e3m2x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16x2_e3m2x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.e3m2x2.f16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.f16x2.to.e3m2x2.rn.satfinite(<2 x half> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_relu_f16x2_e3m2x2(<2 x half> %in) {
+; CHECK-LABEL: cvt_rn_relu_f16x2_e3m2x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_e3m2x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.relu.e3m2x2.f16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.f16x2.to.e3m2x2.rn.relu.satfinite(<2 x half> %in)
+ ret i16 %val
+}
+
+; BF16x2 to E2M3x2/E3M2x2 (fp6)
+define i16 @cvt_rn_sf_e2m3x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_sf_e2m3x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_sf_e2m3x2_bf16x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.e2m3x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e2m3x2.rn.satfinite(<2 x bfloat> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_relu_sf_e2m3x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_relu_sf_e2m3x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_sf_e2m3x2_bf16x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.relu.e2m3x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e2m3x2.rn.relu.satfinite(<2 x bfloat> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_sf_e3m2x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_sf_e3m2x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_sf_e3m2x2_bf16x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.e3m2x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e3m2x2.rn.satfinite(<2 x bfloat> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_relu_sf_e3m2x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_relu_sf_e3m2x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_sf_e3m2x2_bf16x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.relu.e3m2x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e3m2x2.rn.relu.satfinite(<2 x bfloat> %in)
+ ret i16 %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll b/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
new file mode 100644
index 0000000000000..0797db31b8155
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | FileCheck %s
+
+; BF16x2 to E4M3x2/E5M2x2 (fp8x2)
+define i16 @cvt_rn_e4m3x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_e4m3x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_e4m3x2_bf16x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.e4m3x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e4m3x2.rn(<2 x bfloat> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_relu_e4m3x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_relu_e4m3x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_e4m3x2_bf16x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.relu.e4m3x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e4m3x2.rn.relu(<2 x bfloat> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_e5m2x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_e5m2x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_e5m2x2_bf16x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.e5m2x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e5m2x2.rn(<2 x bfloat> %in)
+ ret i16 %val
+}
+
+define i16 @cvt_rn_relu_e5m2x2_bf16x2(<2 x bfloat> %in) {
+; CHECK-LABEL: cvt_rn_relu_e5m2x2_bf16x2(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_e5m2x2_bf16x2_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.relu.e5m2x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.e5m2x2.rn.relu(<2 x bfloat> %in)
+ ret i16 %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll b/llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll
new file mode 100644
index 0000000000000..1da38d2e1eb59
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll
@@ -0,0 +1,269 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx91 | FileCheck %s
+
+; From a pair of floats to s2f6x2
+define i16 @cvt_s2f6x2_f32_f32_rn(float %f1, float %f2) {
+; CHECK-LABEL: cvt_s2f6x2_f32_f32_rn(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_s2f6x2_f32_f32_rn_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_s2f6x2_f32_f32_rn_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.s2f6x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.ff.to.s2f6x2.rn.satfinite(float %f1, float %f2)
+ ret i16 %val
+}
+
+define i16 @cvt_s2f6x2_f32_f32_rn_relu(float %f1, float %f2) {
+; CHECK-LABEL: cvt_s2f6x2_f32_f32_rn_relu(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_s2f6x2_f32_f32_rn_relu_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_s2f6x2_f32_f32_rn_relu_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.relu.s2f6x2.f32 %rs1, %r1, %r2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.ff.to.s2f6x2.rn.relu.satfinite(float %f1, float %f2)
+ ret i16 %val
+}
+
+; From a pair of floats to s2f6x2, with scale variants
+define i16 @cvt_s2f6x2_f32_f32_rn_scale(float %f1, float %f2, i16 %scale) {
+; CHECK-LABEL: cvt_s2f6x2_f32_f32_rn_scale(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_s2f6x2_f32_f32_rn_scale_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_s2f6x2_f32_f32_rn_scale_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_s2f6x2_f32_f32_rn_scale_param_2];
+; CHECK-NEXT: cvt.rn.satfinite.scaled::n2::ue8m0.s2f6x2.f32 %rs2, %r1, %r2, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.ff.to.s2f6x2.rn.satfinite.scale.n2.ue8m0(float %f1, float %f2, i16 %scale)
+ ret i16 %val
+}
+
+define i16 @cvt_s2f6x2_f32_f32_rn_relu_scale(float %f1, float %f2, i16 %scale) {
+; CHECK-LABEL: cvt_s2f6x2_f32_f32_rn_relu_scale(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_s2f6x2_f32_f32_rn_relu_scale_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_s2f6x2_f32_f32_rn_relu_scale_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_s2f6x2_f32_f32_rn_relu_scale_param_2];
+; CHECK-NEXT: cvt.rn.satfinite.relu.scaled::n2::ue8m0.s2f6x2.f32 %rs2, %r1, %r2, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.ff.to.s2f6x2.rn.relu.satfinite.scale.n2.ue8m0(float %f1, float %f2, i16 %scale)
+ ret i16 %val
+}
+
+; From v2bf16 to s2f6x2
+define i16 @cvt_s2f6x2_bf16x2_rn(<2 x bfloat> %f) {
+; CHECK-LABEL: cvt_s2f6x2_bf16x2_rn(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_s2f6x2_bf16x2_rn_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.s2f6x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.s2f6x2.rn.satfinite(<2 x bfloat> %f)
+ ret i16 %val
+}
+
+define i16 @cvt_s2f6x2_bf16x2_rn_relu(<2 x bfloat> %f) {
+; CHECK-LABEL: cvt_s2f6x2_bf16x2_rn_relu(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_s2f6x2_bf16x2_rn_relu_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.relu.s2f6x2.bf16x2 %rs1, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.s2f6x2.rn.relu.satfinite(<2 x bfloat> %f)
+ ret i16 %val
+}
+
+; From v2bf16 to s2f6x2 with scale variants
+define i16 @cvt_s2f6x2_bf16x2_rn_scale(<2 x bfloat> %f, i16 %scale) {
+; CHECK-LABEL: cvt_s2f6x2_bf16x2_rn_scale(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_s2f6x2_bf16x2_rn_scale_param_0];
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_s2f6x2_bf16x2_rn_scale_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.scaled::n2::ue8m0.s2f6x2.bf16x2 %rs2, %r1, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.s2f6x2.rn.satfinite.scale.n2.ue8m0(<2 x bfloat> %f, i16 %scale)
+ ret i16 %val
+}
+
+define i16 @cvt_s2f6x2_bf16x2_rn_relu_scale(<2 x bfloat> %f, i16 %scale) {
+; CHECK-LABEL: cvt_s2f6x2_bf16x2_rn_relu_scale(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_s2f6x2_bf16x2_rn_relu_scale_param_0];
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_s2f6x2_bf16x2_rn_relu_scale_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.relu.scaled::n2::ue8m0.s2f6x2.bf16x2 %rs2, %r1, %rs1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.bf16x2.to.s2f6x2.rn.relu.satfinite.scale.n2.ue8m0(<2 x bfloat> %f, i16 %scale)
+ ret i16 %val
+}
+
+; From s2f6x2 to v2bf16
+define <2 x bfloat> @cvt_bf16x2_s2f6x2_rn(i16 %a) {
+; CHECK-LABEL: cvt_bf16x2_s2f6x2_rn(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_s2f6x2_rn_param_0];
+; CHECK-NEXT: cvt.rn.bf16x2.s2f6x2 %r1, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.s2f6x2.to.bf16x2.rn(i16 %a)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_bf16x2_s2f6x2_rn_relu(i16 %a) {
+; CHECK-LABEL: cvt_bf16x2_s2f6x2_rn_relu(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_s2f6x2_rn_relu_param_0];
+; CHECK-NEXT: cvt.rn.relu.bf16x2.s2f6x2 %r1, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.s2f6x2.to.bf16x2.rn.relu(i16 %a)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_bf16x2_s2f6x2_rn_sf(i16 %a) {
+; CHECK-LABEL: cvt_bf16x2_s2f6x2_rn_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_s2f6x2_rn_sf_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.bf16x2.s2f6x2 %r1, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.s2f6x2.to.bf16x2.rn.satfinite(i16 %a)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_bf16x2_s2f6x2_rn_relu_sf(i16 %a) {
+; CHECK-LABEL: cvt_bf16x2_s2f6x2_rn_relu_sf(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_s2f6x2_rn_relu_sf_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.relu.bf16x2.s2f6x2 %r1, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.s2f6x2.to.bf16x2.rn.relu.satfinite(i16 %a)
+ ret <2 x bfloat> %val
+}
+
+; From s2f6x2 to v2bf16 with scale variants
+define <2 x bfloat> @cvt_bf16x2_s2f6x2_rn_scale(i16 %a, i16 %scale) {
+; CHECK-LABEL: cvt_bf16x2_s2f6x2_rn_scale(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_s2f6x2_rn_scale_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [cvt_bf16x2_s2f6x2_rn_scale_param_1];
+; CHECK-NEXT: cvt.rn.scaled::n2::ue8m0.bf16x2.s2f6x2 %r1, %rs1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.s2f6x2.to.bf16x2.rn.scale.n2.ue8m0(i16 %a, i16 %scale)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_bf16x2_s2f6x2_rn_relu_scale(i16 %a, i16 %scale) {
+; CHECK-LABEL: cvt_bf16x2_s2f6x2_rn_relu_scale(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_s2f6x2_rn_relu_scale_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [cvt_bf16x2_s2f6x2_rn_relu_scale_param_1];
+; CHECK-NEXT: cvt.rn.relu.scaled::n2::ue8m0.bf16x2.s2f6x2 %r1, %rs1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.s2f6x2.to.bf16x2.rn.relu.scale.n2.ue8m0(i16 %a, i16 %scale)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_bf16x2_s2f6x2_rn_sf_scale(i16 %a, i16 %scale) {
+; CHECK-LABEL: cvt_bf16x2_s2f6x2_rn_sf_scale(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_s2f6x2_rn_sf_scale_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [cvt_bf16x2_s2f6x2_rn_sf_scale_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.scaled::n2::ue8m0.bf16x2.s2f6x2 %r1, %rs1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.s2f6x2.to.bf16x2.rn.satfinite.scale.n2.ue8m0(i16 %a, i16 %scale)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_bf16x2_s2f6x2_rn_relu_sf_scale(i16 %a, i16 %scale) {
+; CHECK-LABEL: cvt_bf16x2_s2f6x2_rn_relu_sf_scale(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_s2f6x2_rn_relu_sf_scale_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [cvt_bf16x2_s2f6x2_rn_relu_sf_scale_param_1];
+; CHECK-NEXT: cvt.rn.satfinite.relu.scaled::n2::ue8m0.bf16x2.s2f6x2 %r1, %rs1, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.s2f6x2.to.bf16x2.rn.relu.satfinite.scale.n2.ue8m0(i16 %a, i16 %scale)
+ ret <2 x bfloat> %val
+}
+
>From 70de12108c70b4ad17bda783be46e2235b9745b3 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Thu, 18 Dec 2025 06:09:41 +0000
Subject: [PATCH 2/6] add all architectures in tests
---
llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll | 5 +++++
llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll | 5 +++++
llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll | 5 +++++
llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll | 9 +++++++++
4 files changed, 24 insertions(+)
diff --git a/llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll b/llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll
index 3862e23effe3b..c35ccf5dc318b 100644
--- a/llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll
+++ b/llvm/test/CodeGen/NVPTX/convert_fp4x2_sm_100f.ll
@@ -1,5 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_120f -mattr=+ptx91 | FileCheck %s
+; RUN: %if ptxas-sm_100f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx91 | %ptxas-verify -arch=sm_110f %}
+; RUN: %if ptxas-sm_120f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_120f -mattr=+ptx91 | %ptxas-verify -arch=sm_120f %}
; F16x2 to E2M1x2 (fp4x2)
define i16 @cvt_rn_f16x2_e2m1x2(<2 x half> %in) {
diff --git a/llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll b/llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll
index 43992328767eb..624e0ff14a13b 100644
--- a/llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll
+++ b/llvm/test/CodeGen/NVPTX/convert_fp6x2_sm_100f.ll
@@ -1,5 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_120f -mattr=+ptx91 | FileCheck %s
+; RUN: %if ptxas-sm_100f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx91 | %ptxas-verify -arch=sm_110f %}
+; RUN: %if ptxas-sm_120f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_120f -mattr=+ptx91 | %ptxas-verify -arch=sm_120f %}
; F16x2 to E2M3x2/E3M2x2 (fp6)
define i16 @cvt_rn_f16x2_e2m3x2(<2 x half> %in) {
diff --git a/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll b/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
index 0797db31b8155..b7e72c6224ced 100644
--- a/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
+++ b/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
@@ -1,5 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_120f -mattr=+ptx91 | FileCheck %s
+; RUN: %if ptxas-sm_100f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx91 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx91 | %ptxas-verify -arch=sm_110f %}
+; RUN: %if ptxas-sm_120f && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_120f -mattr=+ptx91 | %ptxas-verify -arch=sm_120f %}
; BF16x2 to E4M3x2/E5M2x2 (fp8x2)
define i16 @cvt_rn_e4m3x2_bf16x2(<2 x bfloat> %in) {
diff --git a/llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll b/llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll
index 1da38d2e1eb59..0e575774557eb 100644
--- a/llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll
+++ b/llvm/test/CodeGen/NVPTX/convert_s2f6x2_sm_100a.ll
@@ -1,5 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_120a -mattr=+ptx91 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_121a -mattr=+ptx91 | FileCheck %s
+; RUN: %if ptxas-sm_100a && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx91 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_103a && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx91 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_110a && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx91 | %ptxas-verify -arch=sm_110a %}
+; RUN: %if ptxas-sm_120a && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_120a -mattr=+ptx91 | %ptxas-verify -arch=sm_120a %}
+; RUN: %if ptxas-sm_121a && ptxas-isa-9.1 %{ llc < %s -march=nvptx64 -mcpu=sm_121a -mattr=+ptx91 | %ptxas-verify -arch=sm_121a %}
; From a pair of floats to s2f6x2
define i16 @cvt_s2f6x2_f32_f32_rn(float %f1, float %f2) {
>From 52d6dcb04e53a9799f119adb0c2e475112435371 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 30 Dec 2025 05:49:14 +0000
Subject: [PATCH 3/6] add missing satfinite modifier to fp8x2 conversions
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 2 +-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2 +-
llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll | 8 ++++----
3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index d9f9207a382ad..cae1958b5dbd8 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1774,7 +1774,7 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin,
PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
- def int_nvvm_bf16x2_to_ # type # _rn # relu
+ def int_nvvm_bf16x2_to_ # type # _rn # relu # _satfinite
: PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index d31ab6fc4b2da..2169b4439821c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2199,7 +2199,7 @@ let Predicates = [callSubtarget<"hasFP8ConversionSupport">] in {
let Predicates = [callSubtarget<"hasFP16X2ToNarrowFPConversionSupport">] in {
foreach dst_type = ["e4m3x2", "e5m2x2"] in {
foreach relu = ["", "_relu"] in {
- def : Pat<(!cast<Intrinsic>("int_nvvm_bf16x2_to_" # dst_type # "_rn" # relu)
+ def : Pat<(!cast<Intrinsic>("int_nvvm_bf16x2_to_" # dst_type # "_rn" # relu # "_satfinite")
B32:$a),
(!cast<NVPTXInst>("CVT_"# dst_type #"_bf16x2") $a,
!cast<PatLeaf>("CvtRN" # !toupper(relu)))>;
diff --git a/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll b/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
index b7e72c6224ced..d889517e6f25d 100644
--- a/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
+++ b/llvm/test/CodeGen/NVPTX/convert_fp8x2_sm_100f.ll
@@ -19,7 +19,7 @@ define i16 @cvt_rn_e4m3x2_bf16x2(<2 x bfloat> %in) {
; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
- %val = call i16 @llvm.nvvm.bf16x2.to.e4m3x2.rn(<2 x bfloat> %in)
+ %val = call i16 @llvm.nvvm.bf16x2.to.e4m3x2.rn.satfinite(<2 x bfloat> %in)
ret i16 %val
}
@@ -35,7 +35,7 @@ define i16 @cvt_rn_relu_e4m3x2_bf16x2(<2 x bfloat> %in) {
; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
- %val = call i16 @llvm.nvvm.bf16x2.to.e4m3x2.rn.relu(<2 x bfloat> %in)
+ %val = call i16 @llvm.nvvm.bf16x2.to.e4m3x2.rn.relu.satfinite(<2 x bfloat> %in)
ret i16 %val
}
@@ -51,7 +51,7 @@ define i16 @cvt_rn_e5m2x2_bf16x2(<2 x bfloat> %in) {
; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
- %val = call i16 @llvm.nvvm.bf16x2.to.e5m2x2.rn(<2 x bfloat> %in)
+ %val = call i16 @llvm.nvvm.bf16x2.to.e5m2x2.rn.satfinite(<2 x bfloat> %in)
ret i16 %val
}
@@ -67,6 +67,6 @@ define i16 @cvt_rn_relu_e5m2x2_bf16x2(<2 x bfloat> %in) {
; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
- %val = call i16 @llvm.nvvm.bf16x2.to.e5m2x2.rn.relu(<2 x bfloat> %in)
+ %val = call i16 @llvm.nvvm.bf16x2.to.e5m2x2.rn.relu.satfinite(<2 x bfloat> %in)
ret i16 %val
}
>From bc8f74b6ca670dea745c7ab4d234088dc1bcd2e8 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 30 Dec 2025 05:55:30 +0000
Subject: [PATCH 4/6] fix formatting
---
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index c8ca0441fd2ef..2a2ed34003503 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -201,7 +201,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
hasPTXWithFamilySMs(88, {100, 101, 120}) ||
hasPTXWithAccelSMs(86, {100, 101, 120});
}
-
+
// Checks support for conversions involving the following types:
// - bf16x2 -> f8x2
// - f16x2 -> f6x2
@@ -211,7 +211,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasFP16X2ToNarrowFPConversionSupport() const {
return hasPTXWithFamilySMs(91, {100, 110, 120});
}
-
+
bool hasS2F6X2ConversionSupport() const {
return hasPTXWithAccelSMs(91, {100, 103, 110, 120, 121});
}
>From 3b83758c1076f34b1f1632827e6147aa2c98310c Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 30 Dec 2025 07:12:40 +0000
Subject: [PATCH 5/6] replace strconcat with #
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 28 ++++++++++++-------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index e3a038ed9a9df..3c20e4b66e17f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -661,8 +661,8 @@ let hasSideEffects = false in {
Requires<[hasPTX<81>, hasSM<89>]>;
def _bf16x2
: NVPTXInst<(outs B16:$dst), (ins B32:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}.satfinite${mode:relu}.", F8Name,
- "x2.bf16x2 \t$dst, $src;"),
+ "cvt${mode:base}.satfinite${mode:relu}." # F8Name #
+ "x2.bf16x2 \t$dst, $src;",
[]>;
}
@@ -766,14 +766,12 @@ let hasSideEffects = false in {
def _f16x2_sf
: NVPTXInst<
(outs B16:$dst), (ins B32:$src, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}."#FP6Name#"x2.f16x2 "
- "\t$dst, $src;",
+ "cvt${mode:base}.satfinite${mode:relu}." # FP6Name # "x2.f16x2 \t$dst, $src;",
[]>;
def _bf16x2_sf
: NVPTXInst<
(outs B16:$dst), (ins B32:$src, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}."#FP6Name#"x2.bf16x2 "
- "\t$dst, $src;",
+ "cvt${mode:base}.satfinite${mode:relu}." # FP6Name # "x2.bf16x2 \t$dst, $src;",
[]>;
}
@@ -805,17 +803,19 @@ let hasSideEffects = false in {
def CVT_e2m1x2_f16x2_sf : NVPTXInst<(outs B16:$dst),
(ins B32:$src, CvtMode:$mode),
- !strconcat("{{ \n\t", ".reg .b8 \t%e2m1x2_out; \n\t",
- "cvt${mode:base}.satfinite${mode:relu}.e2m1x2."
- "f16x2 \t%e2m1x2_out, $src; \n\t",
- "cvt.u16.u8 \t$dst, %e2m1x2_out; \n\t", "}}"), []>;
+ "{{ \n\t" #
+ ".reg .b8 \t%e2m1x2_out; \n\t" #
+ "cvt${mode:base}.satfinite${mode:relu}.e2m1x2.f16x2 \t%e2m1x2_out, $src; \n\t" #
+ "cvt.u16.u8 \t$dst, %e2m1x2_out; \n\t" #
+ "}}", []>;
def CVT_e2m1x2_bf16x2_sf : NVPTXInst<(outs B16:$dst),
(ins B32:$src, CvtMode:$mode),
- !strconcat("{{ \n\t", ".reg .b8 \t%e2m1x2_out; \n\t",
- "cvt${mode:base}.satfinite${mode:relu}.e2m1x2."
- "bf16x2 \t%e2m1x2_out, $src; \n\t",
- "cvt.u16.u8 \t$dst, %e2m1x2_out; \n\t", "}}"), []>;
+ "{{ \n\t" #
+ ".reg .b8 \t%e2m1x2_out; \n\t" #
+ "cvt${mode:base}.satfinite${mode:relu}.e2m1x2.bf16x2 \t%e2m1x2_out, $src; \n\t" #
+ "cvt.u16.u8 \t$dst, %e2m1x2_out; \n\t" #
+ "}}", []>;
// UE8M0x2 conversions.
class CVT_f32_to_ue8m0x2<string sat = ""> :
>From 5eff068a6e6db1b466f4e0aebe4a92bffd593ff7 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Wed, 31 Dec 2025 08:41:35 +0000
Subject: [PATCH 6/6] address comments
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 33 ++++-----
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 86 ++++++++++--------------
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 9 +--
3 files changed, 54 insertions(+), 74 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index cae1958b5dbd8..499add5b78769 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1737,30 +1737,23 @@ let TargetPrefix = "nvvm" in {
}
foreach relu = ["", "_relu"] in {
- def int_nvvm_ff_to_s2f6x2_rn # relu # _satfinite :
- PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>;
-
- def int_nvvm_ff_to_s2f6x2_rn # relu # _satfinite_scale_n2_ue8m0 :
- PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty, llvm_i16_ty]>;
-
- def int_nvvm_bf16x2_to_s2f6x2_rn # relu # _satfinite :
- PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;
+ foreach has_scale = [true, false] in {
+ defvar scale_suffix = !if(has_scale, "_scale_n2_ue8m0", "");
+ defvar scale_arg = !if(has_scale, [llvm_i16_ty], []<LLVMType>);
- def int_nvvm_bf16x2_to_s2f6x2_rn # relu # _satfinite_scale_n2_ue8m0 :
- PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty, llvm_i16_ty]>;
+ def int_nvvm_ff_to_s2f6x2_rn # relu # _satfinite # scale_suffix :
+ PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty] # scale_arg>;
- def int_nvvm_s2f6x2_to_bf16x2_rn # relu # _satfinite :
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>;
+ def int_nvvm_bf16x2_to_s2f6x2_rn # relu # _satfinite # scale_suffix :
+ PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty] # scale_arg>;
- def int_nvvm_s2f6x2_to_bf16x2_rn # relu # _satfinite_scale_n2_ue8m0 :
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty, llvm_i16_ty]>;
+ def int_nvvm_s2f6x2_to_bf16x2_rn # relu # _satfinite # scale_suffix :
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty] # scale_arg>;
- // No satfinite variants
- def int_nvvm_s2f6x2_to_bf16x2_rn # relu :
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>;
-
- def int_nvvm_s2f6x2_to_bf16x2_rn # relu # _scale_n2_ue8m0 :
- PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty, llvm_i16_ty]>;
+ // No satfinite variants
+ def int_nvvm_s2f6x2_to_bf16x2_rn # relu # scale_suffix :
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty] # scale_arg>;
+ }
}
foreach type = ["e4m3x2", "e5m2x2"] in {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3c20e4b66e17f..24eceba481c29 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -659,11 +659,9 @@ let hasSideEffects = false in {
(ins B32:$src), (ins CvtMode:$mode),
"cvt${mode:base}.satfinite${mode:relu}." # F8Name # "x2.f16x2">,
Requires<[hasPTX<81>, hasSM<89>]>;
- def _bf16x2
- : NVPTXInst<(outs B16:$dst), (ins B32:$src, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}." # F8Name #
- "x2.bf16x2 \t$dst, $src;",
- []>;
+ def _bf16x2 :
+ BasicFlagsNVPTXInst<(outs B16:$dst), (ins B32:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}." # F8Name # "x2.bf16x2">;
}
defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">;
@@ -709,36 +707,30 @@ let hasSideEffects = false in {
defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
- def CVT_s2f6x2_f32_sf : NVPTXInst<(outs B16:$dst),
- (ins B32:$src1, B32:$src2, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}.s2f6x2.f32 \t$dst, $src1, $src2;", []>;
- def CVT_s2f6x2_f32_sf_scale : NVPTXInst<(outs B16:$dst),
- (ins B32:$src1, B32:$src2, B16:$scale, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.s2f6x2.f32 \t"
- "$dst, $src1, $src2, $scale;", []>;
-
- def CVT_s2f6x2_bf16x2_sf : NVPTXInst<(outs B16:$dst),
- (ins B32:$src, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}.s2f6x2.bf16x2 \t$dst, $src;", []>;
- def CVT_s2f6x2_bf16x2_sf_scale : NVPTXInst<(outs B16:$dst),
- (ins B32:$src, B16:$scale, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.s2f6x2.bf16x2 \t"
- "$dst, $src, $scale;", []>;
-
- def CVT_bf16x2_s2f6x2 : NVPTXInst<(outs B32:$dst),
- (ins B16:$src, CvtMode:$mode),
- "cvt${mode:base}${mode:relu}.bf16x2.s2f6x2 \t$dst, $src;", []>;
- def CVT_bf16x2_s2f6x2_scale : NVPTXInst<(outs B32:$dst),
- (ins B16:$src, B16:$scale, CvtMode:$mode),
- "cvt${mode:base}${mode:relu}.scaled::n2::ue8m0.bf16x2.s2f6x2 \t"
- "$dst, $src, $scale;", []>;
- def CVT_bf16x2_s2f6x2_sf : NVPTXInst<(outs B32:$dst),
- (ins B16:$src, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}.bf16x2.s2f6x2 \t$dst, $src;", []>;
- def CVT_bf16x2_s2f6x2_sf_scale : NVPTXInst<(outs B32:$dst),
- (ins B16:$src, B16:$scale, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.bf16x2.s2f6x2 \t"
- "$dst, $src, $scale;", []>;
+ def CVT_s2f6x2_f32_sf : BasicFlagsNVPTXInst<(outs B16:$dst),
+ (ins B32:$src1, B32:$src2), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.s2f6x2.f32">;
+ def CVT_s2f6x2_f32_sf_scale : BasicFlagsNVPTXInst<(outs B16:$dst),
+ (ins B32:$src1, B32:$src2, B16:$scale), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.s2f6x2.f32">;
+ def CVT_s2f6x2_bf16x2_sf : BasicFlagsNVPTXInst<(outs B16:$dst),
+ (ins B32:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.s2f6x2.bf16x2">;
+ def CVT_s2f6x2_bf16x2_sf_scale : BasicFlagsNVPTXInst<(outs B16:$dst),
+ (ins B32:$src, B16:$scale), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.s2f6x2.bf16x2">;
+ def CVT_bf16x2_s2f6x2 : BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B16:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.bf16x2.s2f6x2">;
+ def CVT_bf16x2_s2f6x2_scale : BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B16:$src, B16:$scale), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.scaled::n2::ue8m0.bf16x2.s2f6x2">;
+ def CVT_bf16x2_s2f6x2_sf : BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B16:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.bf16x2.s2f6x2">;
+ def CVT_bf16x2_s2f6x2_sf_scale : BasicFlagsNVPTXInst<(outs B32:$dst),
+ (ins B16:$src, B16:$scale), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}.scaled::n2::ue8m0.bf16x2.s2f6x2">;
// FP6 conversions.
foreach type = ["e2m3x2", "e3m2x2"] in {
@@ -758,21 +750,15 @@ let hasSideEffects = false in {
multiclass CVT_TO_FP6X2<string FP6Name> {
def _f32_sf :
- NVPTXInst<(outs B16:$dst),
- (ins B32:$src1, B32:$src2, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}." # FP6Name
- # "x2.f32"
- "\t$dst, $src1, $src2;", []>;
- def _f16x2_sf
- : NVPTXInst<
- (outs B16:$dst), (ins B32:$src, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}." # FP6Name # "x2.f16x2 \t$dst, $src;",
- []>;
- def _bf16x2_sf
- : NVPTXInst<
- (outs B16:$dst), (ins B32:$src, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}." # FP6Name # "x2.bf16x2 \t$dst, $src;",
- []>;
+ BasicFlagsNVPTXInst<(outs B16:$dst),
+ (ins B32:$src1, B32:$src2), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}." # FP6Name # "x2.f32">;
+ def _f16x2_sf :
+ BasicFlagsNVPTXInst<(outs B16:$dst), (ins B32:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}." # FP6Name # "x2.f16x2">;
+ def _bf16x2_sf :
+ BasicFlagsNVPTXInst<(outs B16:$dst), (ins B32:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}." # FP6Name # "x2.bf16x2">;
}
defm CVT_e2m3x2 : CVT_TO_FP6X2<"e2m3">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 2169b4439821c..93887ea3a92bc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2199,10 +2199,11 @@ let Predicates = [callSubtarget<"hasFP8ConversionSupport">] in {
let Predicates = [callSubtarget<"hasFP16X2ToNarrowFPConversionSupport">] in {
foreach dst_type = ["e4m3x2", "e5m2x2"] in {
foreach relu = ["", "_relu"] in {
- def : Pat<(!cast<Intrinsic>("int_nvvm_bf16x2_to_" # dst_type # "_rn" # relu # "_satfinite")
- B32:$a),
- (!cast<NVPTXInst>("CVT_"# dst_type #"_bf16x2") $a,
- !cast<PatLeaf>("CvtRN" # !toupper(relu)))>;
+ defvar intrin = !cast<Intrinsic>("int_nvvm_bf16x2_to_" # dst_type # "_rn" # relu # "_satfinite");
+ defvar cvt_inst = !cast<NVPTXInst>("CVT_"# dst_type #"_bf16x2");
+ defvar cvt_mode = !cast<PatLeaf>("CvtRN" # !toupper(relu));
+ def : Pat<(intrin v2bf16:$a),
+ (cvt_inst $a, cvt_mode)>;
}
}
}
More information about the llvm-commits
mailing list