[llvm] 1f82e81 - [clang][NVPTX] Add intrinsics and builtins for CVT RS rounding mode (#160494)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 5 20:45:28 PDT 2025
- Previous message: [llvm] [PowerPC] Exploit xxeval instruction for operations of the form ternary(A, X, nor(B,C)), ternary(A, X, eqv(B,C)), ternary(A, X, nand(B,C)), ternary(A, X, not(B)) and ternary(A, X, not(C)) (PR #158096)
- Next message: [llvm] [SPIRV] Added Support for the SPV_INTEL_arbitrary_precesion_floating_point Extension (PR #160054)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Author: Srinivasa Ravi
Date: 2025-10-06T09:15:23+05:30
New Revision: 1f82e818faac8a2ff868ec364ce1f3de5ceebf2d
URL: https://github.com/llvm/llvm-project/commit/1f82e818faac8a2ff868ec364ce1f3de5ceebf2d
DIFF: https://github.com/llvm/llvm-project/commit/1f82e818faac8a2ff868ec364ce1f3de5ceebf2d.diff
LOG: [clang][NVPTX] Add intrinsics and builtins for CVT RS rounding mode (#160494)
This change adds LLVM intrinsics and clang builtins for the `cvt`
RS rounding mode instruction variants.
Tests are added in `convert-sm103a.ll` and verified through ptxas-13.0.
Added:
llvm/test/CodeGen/NVPTX/convert-sm103a.ll
Modified:
clang/include/clang/Basic/BuiltinsNVPTX.td
clang/test/CodeGen/builtins-nvptx.c
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
llvm/lib/Target/NVPTX/NVPTX.h
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXISelLowering.h
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 2d6fa1771014d..d923d2a90e908 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -579,11 +579,35 @@ def __nvvm_ff2bf16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)
def __nvvm_ff2bf16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2bf16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2bf16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>;
+def __nvvm_ff2bf16x2_rs :
+ NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_ff2bf16x2_rs_relu :
+ NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_ff2bf16x2_rs_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_ff2bf16x2_rs_relu_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
def __nvvm_ff2f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2f16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
def __nvvm_ff2f16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>;
+def __nvvm_ff2f16x2_rs :
+ NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_ff2f16x2_rs_relu :
+ NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_ff2f16x2_rs_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_ff2f16x2_rs_relu_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
def __nvvm_f2bf16_rn : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2bf16_rn_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
@@ -616,6 +640,19 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>;
def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>;
+def __nvvm_f32x4_to_e4m3x4_rs_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_f32x4_to_e5m2x4_rs_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+
def __nvvm_ff_to_e2m3x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
def __nvvm_ff_to_e2m3x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
def __nvvm_ff_to_e3m2x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
@@ -626,12 +663,32 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_f32x4_to_e2m3x4_rs_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_f32x4_to_e3m2x4_rs_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite :
+ NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+
def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_f32x4_to_e2m1x4_rs_satfinite :
+ NVPTXBuiltinSMAndPTX<"short(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+def __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite :
+ NVPTXBuiltinSMAndPTX<"short(_Vector<4, float>, uint32_t)",
+ SM<"100a", [SM_103a]>, PTX87>;
+
def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index f994adb14e457..e3be262622844 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -43,6 +43,12 @@
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_103a -target-feature +ptx87 -DPTX=87 \
+// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM103a %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 -DPTX=87 \
+// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM100a %s
// ### The last run to check with the highest SM and PTX version available
// ### to make sure target builtins are still accepted.
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
@@ -1203,6 +1209,123 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() {
// CHECK: ret void
}
+__device__ void nvvm_cvt_sm100a_sm103a() {
+#if (PTX >= 87) && (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM103_ALL)
+
+ typedef __fp16 f16x2 __attribute__((ext_vector_type(2)));
+ typedef __bf16 bf16x2 __attribute__((ext_vector_type(2)));
+ typedef char uint8x4 __attribute__((ext_vector_type(4)));
+
+// CHECK_PTX87_SM100a: %[[R1:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM100a: store <2 x half> %[[R1]], ptr %r1
+// CHECK_PTX87_SM103a: %[[R1:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM103a: store <2 x half> %[[R1]], ptr %r1
+ f16x2 r1 = __nvvm_ff2f16x2_rs(1.0f, 1.0f, 0);
+
+// CHECK_PTX87_SM100a: %[[R2:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM100a: store <2 x half> %[[R2]], ptr %r2
+// CHECK_PTX87_SM103a: %[[R2:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM103a: store <2 x half> %[[R2]], ptr %r2
+ f16x2 r2 = __nvvm_ff2f16x2_rs_relu(1.0f, 1.0f, 0);
+
+// CHECK_PTX87_SM100a: %[[R3:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM100a: store <2 x half> %[[R3]], ptr %r3
+// CHECK_PTX87_SM103a: %[[R3:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM103a: store <2 x half> %[[R3]], ptr %r3
+ f16x2 r3 = __nvvm_ff2f16x2_rs_satfinite(1.0f, 1.0f, 0);
+
+// CHECK_PTX87_SM100a: %[[R4:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM100a: store <2 x half> %[[R4]], ptr %r4
+// CHECK_PTX87_SM103a: %[[R4:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM103a: store <2 x half> %[[R4]], ptr %r4
+ f16x2 r4 = __nvvm_ff2f16x2_rs_relu_satfinite(1.0f, 1.0f, 0);
+
+// CHECK_PTX87_SM100a: %[[R5:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R5]], ptr %r5
+// CHECK_PTX87_SM103a: %[[R5:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R5]], ptr %r5
+ bf16x2 r5 = __nvvm_ff2bf16x2_rs(1.0f, 1.0f, 0);
+
+// CHECK_PTX87_SM100a: %[[R6:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R6]], ptr %r6
+// CHECK_PTX87_SM103a: %[[R6:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R6]], ptr %r6
+ bf16x2 r6 = __nvvm_ff2bf16x2_rs_relu(1.0f, 1.0f, 0);
+
+// CHECK_PTX87_SM100a: %[[R7:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R7]], ptr %r7
+// CHECK_PTX87_SM103a: %[[R7:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R7]], ptr %r7
+ bf16x2 r7 = __nvvm_ff2bf16x2_rs_satfinite(1.0f, 1.0f, 0);
+
+// CHECK_PTX87_SM100a: %[[R8:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R8]], ptr %r8
+// CHECK_PTX87_SM103a: %[[R8:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0)
+// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R8]], ptr %r8
+ bf16x2 r8 = __nvvm_ff2bf16x2_rs_relu_satfinite(1.0f, 1.0f, 0);
+
+// CHECK_PTX87_SM100a: %[[R9:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store <4 x i8> %[[R9]], ptr %r9
+// CHECK_PTX87_SM103a: %[[R9:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store <4 x i8> %[[R9]], ptr %r9
+ uint8x4 r9 = __nvvm_f32x4_to_e4m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R10:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store <4 x i8> %[[R10]], ptr %r10
+// CHECK_PTX87_SM103a: %[[R10:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store <4 x i8> %[[R10]], ptr %r10
+ uint8x4 r10 = __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R11:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store <4 x i8> %[[R11]], ptr %r11
+// CHECK_PTX87_SM103a: %[[R11:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store <4 x i8> %[[R11]], ptr %r11
+ uint8x4 r11 = __nvvm_f32x4_to_e5m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R12:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store <4 x i8> %[[R12]], ptr %r12
+// CHECK_PTX87_SM103a: %[[R12:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store <4 x i8> %[[R12]], ptr %r12
+ uint8x4 r12 = __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R13:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store <4 x i8> %[[R13]], ptr %r13
+// CHECK_PTX87_SM103a: %[[R13:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store <4 x i8> %[[R13]], ptr %r13
+ uint8x4 r13 = __nvvm_f32x4_to_e2m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R14:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store <4 x i8> %[[R14]], ptr %r14
+// CHECK_PTX87_SM103a: %[[R14:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store <4 x i8> %[[R14]], ptr %r14
+ uint8x4 r14 = __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R15:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store <4 x i8> %[[R15]], ptr %r15
+// CHECK_PTX87_SM103a: %[[R15:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store <4 x i8> %[[R15]], ptr %r15
+ uint8x4 r15 = __nvvm_f32x4_to_e3m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R16:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store <4 x i8> %[[R16]], ptr %r16
+// CHECK_PTX87_SM103a: %[[R16:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store <4 x i8> %[[R16]], ptr %r16
+ uint8x4 r16 = __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R17:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store i16 %[[R17]], ptr %r17
+// CHECK_PTX87_SM103a: %[[R17:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store i16 %[[R17]], ptr %r17
+ short r17 = __nvvm_f32x4_to_e2m1x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+
+// CHECK_PTX87_SM100a: %[[R18:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM100a: store i16 %[[R18]], ptr %r18
+// CHECK_PTX87_SM103a: %[[R18:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0)
+// CHECK_PTX87_SM103a: store i16 %[[R18]], ptr %r18
+ short r18 = __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0);
+#endif
+}
+
#define NAN32 0x7FBFFFFF
#define NAN16 (__bf16)0x7FBF
#define BF16 (__bf16)0.1f
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 9cfab26fffa54..23d878f726c5e 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1493,6 +1493,18 @@ let TargetPrefix = "nvvm" in {
}
}
+ // RS rounding mode (Stochastic Rounding) conversions for f16x2, bf16x2 types
+ // The last i32 operand provides the random bits for the conversion
+ foreach relu = ["", "_relu"] in {
+ foreach satfinite = ["", "_satfinite"] in {
+ def int_nvvm_ff2f16x2_rs # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]>;
+
+ def int_nvvm_ff2bf16x2_rs # relu # satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]>;
+ }
+ }
+
foreach satfinite = ["", "_satfinite"] in {
def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin,
PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>;
@@ -1515,6 +1527,15 @@ let TargetPrefix = "nvvm" in {
PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
}
}
+
+ // RS rounding mode (Stochastic Rounding) conversions for f8x4 types
+ // The last i32 operand provides the random bits for the conversion
+ foreach type = ["e4m3x4", "e5m2x4"] in {
+ foreach relu = ["", "_relu"] in {
+ def int_nvvm_f32x4_to_ # type # _rs # relu # _satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_v4i8_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
+ }
+ }
// FP4 conversions.
foreach relu = ["", "_relu"] in {
@@ -1524,6 +1545,13 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin,
PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
}
+
+ // RS rounding mode (Stochastic Rounding) conversions for f4x4 type
+ // The last i32 operand provides the random bits for the conversion
+ foreach relu = ["", "_relu"] in {
+ def int_nvvm_f32x4_to_e2m1x4_rs # relu # _satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_i16_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
+ }
// FP6 conversions.
foreach type = ["e2m3x2", "e3m2x2"] in {
@@ -1535,6 +1563,15 @@ let TargetPrefix = "nvvm" in {
PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>;
}
}
+
+ // RS rounding mode (Stochastic Rounding) conversions for f6x4 types
+ // The last i32 operand provides the random bits for the conversion
+ foreach type = ["e2m3x4", "e3m2x4"] in {
+ foreach relu = ["", "_relu"] in {
+ def int_nvvm_f32x4_to_ # type # _rs # relu # _satfinite : NVVMBuiltin,
+ PureIntrinsic<[llvm_v4i8_ty], [llvm_v4f32_ty, llvm_i32_ty]>;
+ }
+ }
// UE8M0x2 conversions.
foreach rmode = ["_rz", "_rp"] in {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index f9bdc09935330..77913f27838e2 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -149,6 +149,9 @@ void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
case NVPTX::PTXCvtMode::RNA:
O << ".rna";
return;
+ case NVPTX::PTXCvtMode::RS:
+ O << ".rs";
+ return;
}
}
llvm_unreachable("Invalid conversion modifier");
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 77a0e03d4075a..1e0f747f8f7fc 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -207,6 +207,7 @@ enum CvtMode {
RM,
RP,
RNA,
+ RS,
BASE_MASK = 0x0F,
FTZ_FLAG = 0x10,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8c21746c4369e..bc047a4aa999d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1096,9 +1096,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Enable custom lowering for the following:
// * MVT::i128 - clusterlaunchcontrol
// * MVT::i32 - prmt
+ // * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics
// * MVT::Other - internal.addrspace.wrap
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other},
- Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN,
+ {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -1181,6 +1182,11 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT)
MAKE_CASE(
NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT)
+ MAKE_CASE(NVPTXISD::CVT_E4M3X4_F32X4_RS_SF)
+ MAKE_CASE(NVPTXISD::CVT_E5M2X4_F32X4_RS_SF)
+ MAKE_CASE(NVPTXISD::CVT_E2M3X4_F32X4_RS_SF)
+ MAKE_CASE(NVPTXISD::CVT_E3M2X4_F32X4_RS_SF)
+ MAKE_CASE(NVPTXISD::CVT_E2M1X4_F32X4_RS_SF)
}
return nullptr;
@@ -2903,6 +2909,61 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op,
{TryCancelResponse0, TryCancelResponse1});
}
+static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) {
+ SDNode *N = Op.getNode();
+ SDLoc DL(N);
+ SDValue F32Vec = N->getOperand(1);
+ SDValue RBits = N->getOperand(2);
+
+ unsigned IntrinsicID = N->getConstantOperandVal(0);
+
+ // Extract the 4 float elements from the vector
+ SmallVector<SDValue, 6> Ops;
+ for (unsigned i = 0; i < 4; ++i)
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec,
+ DAG.getIntPtrConstant(i, DL)));
+
+ using NVPTX::PTXCvtMode::CvtMode;
+
+ auto [OpCode, RetTy, CvtModeFlag] =
+ [&]() -> std::tuple<NVPTXISD::NodeType, MVT::SimpleValueType, uint32_t> {
+ switch (IntrinsicID) {
+ case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
+ return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8,
+ CvtMode::RS | CvtMode::RELU_FLAG};
+ case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
+ return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
+ case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
+ return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8,
+ CvtMode::RS | CvtMode::RELU_FLAG};
+ case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
+ return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
+ case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
+ return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8,
+ CvtMode::RS | CvtMode::RELU_FLAG};
+ case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
+ return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
+ case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
+ return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8,
+ CvtMode::RS | CvtMode::RELU_FLAG};
+ case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
+ return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
+ case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
+ return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16,
+ CvtMode::RS | CvtMode::RELU_FLAG};
+ case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
+ return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, CvtMode::RS};
+ default:
+ llvm_unreachable("unsupported/unhandled intrinsic");
+ }
+ }();
+
+ Ops.push_back(RBits);
+ Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32));
+
+ return DAG.getNode(OpCode, DL, RetTy, Ops);
+}
+
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
const unsigned Mode = [&]() {
switch (Op->getConstantOperandVal(0)) {
@@ -2972,6 +3033,17 @@ static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
return LowerClusterLaunchControlQueryCancel(Op, DAG);
+ case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
+ case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
+ return lowerCvtRSIntrinsics(Op, DAG);
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 769d2fe46f2c8..63fa0bb9159ff 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -79,6 +79,11 @@ enum NodeType : unsigned {
CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X,
CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y,
CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z,
+ CVT_E4M3X4_F32X4_RS_SF,
+ CVT_E5M2X4_F32X4_RS_SF,
+ CVT_E2M3X4_F32X4_RS_SF,
+ CVT_E3M2X4_F32X4_RS_SF,
+ CVT_E2M1X4_F32X4_RS_SF,
FIRST_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4cacee2290763..6c14cf0b324e1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -34,7 +34,8 @@ def CvtRN : PatLeaf<(i32 0x5)>;
def CvtRZ : PatLeaf<(i32 0x6)>;
def CvtRM : PatLeaf<(i32 0x7)>;
def CvtRP : PatLeaf<(i32 0x8)>;
-def CvtRNA : PatLeaf<(i32 0x9)>;
+def CvtRNA : PatLeaf<(i32 0x9)>;
+def CvtRS : PatLeaf<(i32 0xA)>;
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
@@ -50,8 +51,9 @@ def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
-def CvtRN_RELU : PatLeaf<(i32 0x45)>;
-def CvtRZ_RELU : PatLeaf<(i32 0x46)>;
+def CvtRN_RELU : PatLeaf<(i32 0x45)>;
+def CvtRZ_RELU : PatLeaf<(i32 0x46)>;
+def CvtRS_RELU : PatLeaf<(i32 0x4A)>;
def CvtMode : Operand<i32> {
let PrintMethod = "printCvtMode";
@@ -133,6 +135,11 @@ def hasSM100a : Predicate<"Subtarget->getSmVersion() == 100 && Subtarget->hasArc
def hasSM101a : Predicate<"Subtarget->getSmVersion() == 101 && Subtarget->hasArchAccelFeatures()">;
def hasSM120a : Predicate<"Subtarget->getSmVersion() == 120 && Subtarget->hasArchAccelFeatures()">;
+def hasSM100aOrSM103a :
+ Predicate<"(Subtarget->getSmVersion() == 100 || " #
+ "Subtarget->getSmVersion() == 103) " #
+ "&& Subtarget->hasArchAccelFeatures()">;
+
// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
"&& Subtarget->getPTXVersion() >= 64)">;
@@ -593,6 +600,23 @@ let hasSideEffects = false in {
defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", B32>;
defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", B32>;
+
+ multiclass CVT_FROM_FLOAT_V2_RS<string FromName, RegisterClass RC> {
+ def _f32_rs :
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins B32:$src1, B32:$src2, B32:$src3),
+ (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}." # FromName # ".f32">;
+
+ def _f32_rs_sf :
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins B32:$src1, B32:$src2, B32:$src3),
+ (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.satfinite." # FromName # ".f32">;
+ }
+
+ defm CVT_f16x2 : CVT_FROM_FLOAT_V2_RS<"f16x2", B32>;
+ defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_RS<"bf16x2", B32>;
// FP8 conversions.
multiclass CVT_TO_F8X2<string F8Name> {
@@ -619,6 +643,15 @@ let hasSideEffects = false in {
def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">;
def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">;
+
+ class CVT_TO_FP8X4<string F8Name> :
+ NVPTXInst<(outs B32:$dst),
+ (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.satfinite." # F8Name #
+ "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">;
+
+ def CVT_e4m3x4_f32x4_rs_sf : CVT_TO_FP8X4<"e4m3">;
+ def CVT_e5m2x4_f32x4_rs_sf : CVT_TO_FP8X4<"e5m2">;
// Float to TF32 conversions
multiclass CVT_TO_TF32<string Modifier, list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> {
@@ -652,6 +685,15 @@ let hasSideEffects = false in {
"cvt${mode:base}${mode:relu}.f16x2." # type>;
}
+ class CVT_TO_FP6X4<string F6Name> :
+ NVPTXInst<(outs B32:$dst),
+ (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.satfinite." # F6Name #
+ "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">;
+
+ def CVT_e2m3x4_f32x4_rs_sf : CVT_TO_FP6X4<"e2m3">;
+ def CVT_e3m2x4_f32x4_rs_sf : CVT_TO_FP6X4<"e3m2">;
+
// FP4 conversions.
def CVT_e2m1x2_f32_sf : NVPTXInst<(outs B16:$dst),
(ins B32:$src1, B32:$src2, CvtMode:$mode),
@@ -668,6 +710,12 @@ let hasSideEffects = false in {
"cvt.u8.u16 \t%e2m1x2_in, $src; \n\t",
"cvt${mode:base}${mode:relu}.f16x2.e2m1x2 \t$dst, %e2m1x2_in; \n\t",
"}}"), []>;
+
+ def CVT_e2m1x4_f32x4_rs_sf :
+ NVPTXInst<(outs B16:$dst),
+ (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.satfinite.e2m1x4.f32 \t" #
+ "$dst, {{$src1, $src2, $src3, $src4}}, $src5;">;
// UE8M0x2 conversions.
class CVT_f32_to_ue8m0x2<string sat = ""> :
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index e91171c1ae38f..28d4bb917ff69 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1782,11 +1782,32 @@ def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, C
def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ)>;
def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>;
+let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in {
+def : Pat<(int_nvvm_ff2bf16x2_rs f32:$a, f32:$b, i32:$c),
+ (CVT_bf16x2_f32_rs $a, $b, $c, CvtRS)>;
+def : Pat<(int_nvvm_ff2bf16x2_rs_relu f32:$a, f32:$b, i32:$c),
+ (CVT_bf16x2_f32_rs $a, $b, $c, CvtRS_RELU)>;
+def : Pat<(int_nvvm_ff2bf16x2_rs_satfinite f32:$a, f32:$b, i32:$c),
+ (CVT_bf16x2_f32_rs_sf $a, $b, $c, CvtRS)>;
+def : Pat<(int_nvvm_ff2bf16x2_rs_relu_satfinite f32:$a, f32:$b, i32:$c),
+ (CVT_bf16x2_f32_rs_sf $a, $b, $c, CvtRS_RELU)>;
+}
+
def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN)>;
def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>;
def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ)>;
def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>;
+let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in {
+def : Pat<(int_nvvm_ff2f16x2_rs f32:$a, f32:$b, i32:$c),
+ (CVT_f16x2_f32_rs $a, $b, $c, CvtRS)>;
+def : Pat<(int_nvvm_ff2f16x2_rs_relu f32:$a, f32:$b, i32:$c),
+ (CVT_f16x2_f32_rs $a, $b, $c, CvtRS_RELU)>;
+def : Pat<(int_nvvm_ff2f16x2_rs_satfinite f32:$a, f32:$b, i32:$c),
+ (CVT_f16x2_f32_rs_sf $a, $b, $c, CvtRS)>;
+def : Pat<(int_nvvm_ff2f16x2_rs_relu_satfinite f32:$a, f32:$b, i32:$c),
+ (CVT_f16x2_f32_rs_sf $a, $b, $c, CvtRS_RELU)>;
+}
def : Pat<(int_nvvm_f2bf16_rn f32:$a), (CVT_bf16_f32 $a, CvtRN)>;
def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a), (CVT_bf16_f32 $a, CvtRN_RELU)>;
def : Pat<(int_nvvm_f2bf16_rz f32:$a), (CVT_bf16_f32 $a, CvtRZ)>;
@@ -1929,6 +1950,52 @@ let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in {
(CVT_bf16x2_ue8m0x2 $a)>;
}
+def SDT_CVT_F32X4_TO_FPX4_RS_VEC :
+ SDTypeProfile<1, 6, [SDTCisVec<0>, SDTCisFP<1>, SDTCisFP<2>, SDTCisFP<3>,
+ SDTCisFP<4>, SDTCisInt<5>, SDTCisInt<6>]>;
+
+def SDT_CVT_F32X4_TO_FPX4_RS_INT :
+ SDTypeProfile<1, 6, [SDTCisInt<0>, SDTCisFP<1>, SDTCisFP<2>, SDTCisFP<3>,
+ SDTCisFP<4>, SDTCisInt<5>, SDTCisInt<6>]>;
+
+class CVT_F32X4_TO_FPX4_RS_SF_NODE<string FPName, SDTypeProfile SDT> :
+ SDNode<"NVPTXISD::CVT_" # FPName # "X4_F32X4_RS_SF", SDT, []>;
+
+multiclass CVT_F32X4_TO_FPX4_RS_SF_VEC<string FPName, VTVec RetTy> {
+ def : Pat<(RetTy (CVT_F32X4_TO_FPX4_RS_SF_NODE<!toupper(FPName),
+ SDT_CVT_F32X4_TO_FPX4_RS_VEC>
+ f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)),
+ (!cast<NVPTXInst>("CVT_" # FPName # "x4_f32x4_rs_sf")
+ $f1, $f2, $f3, $f4, $rbits, CvtRS)>;
+
+ def : Pat<(RetTy (CVT_F32X4_TO_FPX4_RS_SF_NODE<!toupper(FPName),
+ SDT_CVT_F32X4_TO_FPX4_RS_VEC>
+ f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)),
+ (!cast<NVPTXInst>("CVT_" # FPName # "x4_f32x4_rs_sf")
+ $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>;
+}
+
+// RS rounding mode conversions
+let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in {
+// FP8x4 conversions
+defm : CVT_F32X4_TO_FPX4_RS_SF_VEC<"e4m3", v4i8>;
+defm : CVT_F32X4_TO_FPX4_RS_SF_VEC<"e5m2", v4i8>;
+
+// FP6x4 conversions
+defm : CVT_F32X4_TO_FPX4_RS_SF_VEC<"e2m3", v4i8>;
+defm : CVT_F32X4_TO_FPX4_RS_SF_VEC<"e3m2", v4i8>;
+
+// FP4x4 conversions
+def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1",
+ SDT_CVT_F32X4_TO_FPX4_RS_INT>
+ f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)),
+ (CVT_e2m1x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>;
+def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1",
+ SDT_CVT_F32X4_TO_FPX4_RS_INT>
+ f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)),
+ (CVT_e2m1x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>;
+}
+
//
// FNS
//
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm103a.ll b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll
new file mode 100644
index 0000000000000..54b4dd88867ed
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll
@@ -0,0 +1,297 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | FileCheck %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 | FileCheck %s
+; RUN: %if ptxas-sm_100a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_103a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 | %ptxas-verify -arch=sm_103a %}
+
+; F16X2 conversions
+
+define <2 x half> @cvt_rs_f16x2_f32(float %f1, float %f2, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_f16x2_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_f16x2_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_f16x2_f32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_f16x2_f32_param_2];
+; CHECK-NEXT: cvt.rs.f16x2.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float %f1, float %f2, i32 %rbits)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rs_relu_f16x2_f32(float %f1, float %f2, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_f16x2_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_f16x2_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_f16x2_f32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_f16x2_f32_param_2];
+; CHECK-NEXT: cvt.rs.relu.f16x2.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float %f1, float %f2, i32 %rbits)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rs_sf_f16x2_f32(float %f1, float %f2, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_sf_f16x2_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_f16x2_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_f16x2_f32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_f16x2_f32_param_2];
+; CHECK-NEXT: cvt.rs.satfinite.f16x2.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float %f1, float %f2, i32 %rbits)
+ ret <2 x half> %val
+}
+
+define <2 x half> @cvt_rs_relu_sf_f16x2_f32(float %f1, float %f2, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_sf_f16x2_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_f16x2_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_f16x2_f32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_f16x2_f32_param_2];
+; CHECK-NEXT: cvt.rs.relu.satfinite.f16x2.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float %f1, float %f2, i32 %rbits)
+ ret <2 x half> %val
+}
+
+; BF16X2 conversions
+
+define <2 x bfloat> @cvt_rs_bf16x2_f32(float %f1, float %f2, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_bf16x2_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_bf16x2_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_bf16x2_f32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_bf16x2_f32_param_2];
+; CHECK-NEXT: cvt.rs.bf16x2.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float %f1, float %f2, i32 %rbits)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rs_relu_bf16x2_f32(float %f1, float %f2, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_bf16x2_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_bf16x2_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_bf16x2_f32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_bf16x2_f32_param_2];
+; CHECK-NEXT: cvt.rs.relu.bf16x2.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float %f1, float %f2, i32 %rbits)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rs_sf_bf16x2_f32(float %f1, float %f2, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_sf_bf16x2_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_bf16x2_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_bf16x2_f32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_bf16x2_f32_param_2];
+; CHECK-NEXT: cvt.rs.satfinite.bf16x2.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float %f1, float %f2, i32 %rbits)
+ ret <2 x bfloat> %val
+}
+
+define <2 x bfloat> @cvt_rs_relu_sf_bf16x2_f32(float %f1, float %f2, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_sf_bf16x2_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_bf16x2_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_bf16x2_f32_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_bf16x2_f32_param_2];
+; CHECK-NEXT: cvt.rs.relu.satfinite.bf16x2.f32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float %f1, float %f2, i32 %rbits)
+ ret <2 x bfloat> %val
+}
+
+; F8X4 conversions
+
+define <4 x i8> @cvt_rs_sf_e4m3x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_sf_e4m3x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e4m3x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e4m3x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.satfinite.e4m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret <4 x i8> %val
+}
+
+define <4 x i8> @cvt_rs_relu_sf_e4m3x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_sf_e4m3x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e4m3x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e4m3x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.relu.satfinite.e4m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret <4 x i8> %val
+}
+
+define <4 x i8> @cvt_rs_sf_e5m2x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_sf_e5m2x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e5m2x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e5m2x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.satfinite.e5m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret <4 x i8> %val
+}
+
+define <4 x i8> @cvt_rs_relu_sf_e5m2x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_sf_e5m2x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e5m2x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e5m2x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.relu.satfinite.e5m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret <4 x i8> %val
+}
+
+; F6X4 conversions
+
+define <4 x i8> @cvt_rs_sf_e2m3x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_sf_e2m3x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e2m3x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e2m3x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.satfinite.e2m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret <4 x i8> %val
+}
+
+define <4 x i8> @cvt_rs_relu_sf_e2m3x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_sf_e2m3x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e2m3x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e2m3x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.relu.satfinite.e2m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret <4 x i8> %val
+}
+
+define <4 x i8> @cvt_rs_sf_e3m2x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_sf_e3m2x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e3m2x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e3m2x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.satfinite.e3m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret <4 x i8> %val
+}
+
+define <4 x i8> @cvt_rs_relu_sf_e3m2x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_sf_e3m2x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e3m2x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e3m2x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.relu.satfinite.e3m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret <4 x i8> %val
+}
+
+; F4X4 conversions
+
+define i16 @cvt_rs_sf_e2m1x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_sf_e2m1x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e2m1x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e2m1x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.satfinite.e2m1x4.f32 %rs1, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret i16 %val
+}
+
+define i16 @cvt_rs_relu_sf_e2m1x4_f32(<4 x float> %fvec, i32 %rbits) {
+; CHECK-LABEL: cvt_rs_relu_sf_e2m1x4_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e2m1x4_f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e2m1x4_f32_param_1];
+; CHECK-NEXT: cvt.rs.relu.satfinite.e2m1x4.f32 %rs1, {%r1, %r2, %r3, %r4}, %r5;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %val = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits)
+ ret i16 %val
+}
- Previous message: [llvm] [PowerPC] Exploit xxeval instruction for operations of the form ternary(A, X, nor(B,C)), ternary(A, X, eqv(B,C)), ternary(A, X, nand(B,C)), ternary(A, X, not(B)) and ternary(A, X, not(C)) (PR #158096)
- Next message: [llvm] [SPIRV] Added Support for the SPV_INTEL_arbitrary_precesion_floating_point Extension (PR #160054)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the llvm-commits
mailing list