[llvm] b4a0d7e - [NVPTX] Fix PTX and SM conditions for narrow FP conversions (#168680)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 21:51:41 PST 2025
Author: Srinivasa Ravi
Date: 2025-11-21T11:21:37+05:30
New Revision: b4a0d7e89fac280b2917b3fca906ad2f3a52da74
URL: https://github.com/llvm/llvm-project/commit/b4a0d7e89fac280b2917b3fca906ad2f3a52da74
DIFF: https://github.com/llvm/llvm-project/commit/b4a0d7e89fac280b2917b3fca906ad2f3a52da74.diff
LOG: [NVPTX] Fix PTX and SM conditions for narrow FP conversions (#168680)
This change fixes the PTX and SM conditions for narrow FP
conversion intrinsics and adds support for family-conditionals.
Added:
Modified:
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/lib/Target/NVPTX/NVPTXSubtarget.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8fd014a09cc58..b54cce4781b8d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2071,34 +2071,36 @@ def : Pat<(int_nvvm_ull2d_rp i64:$a), (CVT_f64_u64 $a, CvtRP)>;
def : Pat<(int_nvvm_f2h_rn_ftz f32:$a), (CVT_f16_f32 $a, CvtRN_FTZ)>;
def : Pat<(int_nvvm_f2h_rn f32:$a), (CVT_f16_f32 $a, CvtRN)>;
-def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b),
- (CVT_e4m3x2_f32 $a, $b, CvtRN)>;
-def : Pat<(int_nvvm_ff_to_e4m3x2_rn_relu f32:$a, f32:$b),
- (CVT_e4m3x2_f32 $a, $b, CvtRN_RELU)>;
-def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b),
- (CVT_e5m2x2_f32 $a, $b, CvtRN)>;
-def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
- (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
-
-def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a),
- (CVT_e4m3x2_f16x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a),
- (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a),
- (CVT_e5m2x2_f16x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a),
- (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>;
-
-def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a),
- (CVT_f16x2_e4m3x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a),
- (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a),
- (CVT_f16x2_e5m2x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a),
- (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
-
-let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in {
+let Predicates = [callSubtarget<"hasFP8ConversionSupport">] in {
+ def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b),
+ (CVT_e4m3x2_f32 $a, $b, CvtRN)>;
+ def : Pat<(int_nvvm_ff_to_e4m3x2_rn_relu f32:$a, f32:$b),
+ (CVT_e4m3x2_f32 $a, $b, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b),
+ (CVT_e5m2x2_f32 $a, $b, CvtRN)>;
+ def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
+ (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
+
+ def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a),
+ (CVT_e4m3x2_f16x2 $a, CvtRN)>;
+ def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a),
+ (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a),
+ (CVT_e5m2x2_f16x2 $a, CvtRN)>;
+ def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a),
+ (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>;
+
+ def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a),
+ (CVT_f16x2_e4m3x2 $a, CvtRN)>;
+ def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a),
+ (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>;
+ def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a),
+ (CVT_f16x2_e5m2x2 $a, CvtRN)>;
+ def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a),
+ (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
+}
+
+let Predicates = [callSubtarget<"hasNarrowFPConversionSupport">] in {
def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b),
(CVT_e2m3x2_f32_sf $a, $b, CvtRN)>;
def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b),
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 021b1f6d0bf57..f11d331862081 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -177,6 +177,27 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
hasPTXWithAccelSMs(86, {100, 101});
}
+ // Checks support for conversions involving e4m3x2 and e5m2x2.
+ bool hasFP8ConversionSupport() const {
+ if (PTXVersion >= 81)
+ return SmVersion >= 89;
+
+ if (PTXVersion >= 78)
+ return SmVersion >= 90;
+
+ return false;
+ }
+
+ // Checks support for conversions involving the following types:
+ // - e2m3x2/e3m2x2
+ // - e2m1x2
+ // - ue8m0x2
+ bool hasNarrowFPConversionSupport() const {
+ return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
+ hasPTXWithFamilySMs(88, {100, 101, 120}) ||
+ hasPTXWithAccelSMs(86, {100, 101, 120});
+ }
+
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it would assume that control flow
// continued to the next instruction. The next instruction could be in the
More information about the llvm-commits mailing list