[clang] [llvm] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants (PR #134345)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 3 21:56:03 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
Author: Srinivasa Ravi (Wolfram70)
<details>
<summary>Changes</summary>
This change adds NVVM intrinsics and clang builtins for the cvt instruction variants of types `.e2m3x2`, `.e3m2x2`, and `.ue8m0x2` introduced in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`.
Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.
PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
Patch is 36.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134345.diff
6 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsNVPTX.td (+31)
- (modified) clang/test/CodeGen/builtins-nvptx.c (+187-4)
- (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+39)
- (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+47)
- (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+56)
- (added) llvm/test/CodeGen/NVPTX/convert-sm100a.ll (+290)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 61e48b31c244b..d240b1a8d0d16 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_89, PTX81>;
+def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>;
+def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>;
+def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>;
+def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>;
def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>;
def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>;
def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>;
+def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ue8m0x2_to_bf16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
// FNS
let Attributes = [NoThrow] in {
def __nvvm_fns : NVPTXBuiltinPTX<"unsigned int(unsigned int, unsigned int, int)", PTX60>;
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index ffa41c85c2734..f74ae332ecf23 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -25,14 +25,29 @@
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
-// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81\
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
// ### The last run to check with the highest SM and PTX version available
// ### to make sure target builtins are still accepted.
-// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
-// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
@@ -994,7 +1009,7 @@ __device__ void nvvm_cvt_sm80() {
// CHECK-LABEL: nvvm_cvt_sm89
__device__ void nvvm_cvt_sm89() {
-#if __CUDA_ARCH__ >= 890
+#if (PTX >= 81) && (__CUDA_ARCH__ >= 890)
// CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
__nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
// CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
@@ -1021,6 +1036,174 @@ __device__ void nvvm_cvt_sm89() {
__nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
// CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
__nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
+
+ // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00)
+ __nvvm_f2tf32_rna_satfinite(1.0f);
+#endif
+ // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_cvt_sm90
+__device__ void nvvm_cvt_sm90() {
+#if (PTX >= 78) && (__CUDA_ARCH__ >= 900)
+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00)
+ __nvvm_f2tf32_rn(1.0f);
+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00)
+ __nvvm_f2tf32_rn_relu(1.0f);
+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00)
+ __nvvm_f2tf32_rz(1.0f);
+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00)
+ __nvvm_f2tf32_rz_relu(1.0f);
+#endif
+ // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_cvt_sm100
+__device__ void nvvm_cvt_sm100() {
+#if (PTX >= 86) && (__CUDA_ARCH__ >= 1000)
+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00)
+ __nvvm_f2tf32_rn_satfinite(1.0f);
+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2tf32_rn_relu_satfinite(1.0f);
+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00)
+ __nvvm_f2tf32_rz_satfinite(1.0f);
+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00)
+ __nvvm_f2tf32_rz_relu_satfinite(1.0f);
+#endif
+ // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_cvt_sm100a
+__device__ void nvvm_cvt_sm100a() {
+#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM100_ALL
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f);
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f);
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f);
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f);
+
+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
+ __nvvm_e2m3x2_to_f16x2_rn(0x4C4C);
+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
+ __nvvm_e2m3x2_to_f16x2_rn_relu(0x4848);
+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
+ __nvvm_e3m2x2_to_f16x2_rn(0x4848);
+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
+ __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
+
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f);
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f);
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f);
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f);
+
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f});
+
+ // CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
+ __nvvm_ue8m0x2_to_bf16x2(0x4C4C);
+#endif
+ // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_cvt_sm101a
+__device__ void nvvm_cvt_sm101a() {
+#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM101_ALL
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f);
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f);
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f);
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f);
+
+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
+ __nvvm_e2m3x2_to_f16x2_rn(0x4C4C);
+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
+ __nvvm_e2m3x2_to_f16x2_rn_relu(0x4848);
+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
+ __nvvm_e3m2x2_to_f16x2_rn(0x4848);
+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
+ __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
+
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f);
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f);
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f);
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f);
+
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f});
+
+ // CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
+ __nvvm_ue8m0x2_to_bf16x2(0x4C4C);
+#endif
+ // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_cvt_sm120a
+__device__ void nvvm_cvt_sm120a() {
+#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM120_ALL
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f);
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f);
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f);
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f);
+
+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
+ __nvvm_e2m3x2_to_f16x2_rn(0x4C4C);
+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
+ __nvvm_e2m3x2_to_f16x2_rn_relu(0x4848);
+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
+ __nvvm_e3m2x2_to_f16x2_rn(0x4848);
+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
+ __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
+
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f);
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f);
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f);
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
+ __nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f);
+
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f});
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f});
+
+ // CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
+ __nvvm_ue8m0x2_to_bf16x2(0x4C4C);
#endif
// CHECK: ret void
}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 80e10f33b770d..e764284d7fb7f 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1548,6 +1548,45 @@ let TargetPrefix = "nvvm" in {
Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
def int_nvvm_e5m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">,
Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+
+ def int_nvvm_ff_to_e2m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_ff_to_e2m3x2_rn_relu : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn_relu">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_ff_to_e3m2x2_rn : ClangBuiltin<"__nvvm_ff_to_e3m2x2_rn">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_ff_to_e3m2x2_rn_relu : ClangBuiltin<"__nvvm_ff_to_e3m2x2_rn_relu">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+
+ def int_nvvm_e2m3x2_to_f16x2_rn : ClangBuiltin<"__nvvm_e2m3x2_to_f16x2_rn">,
+ Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_e2m3x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e2m3x2_to_f16x2_rn_relu">,
+ Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_e3m2x2_to_f16x2_rn : ClangBuiltin<"__nvvm_e3m2x2_to_f16x2_rn">,
+ Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_e3m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e3m2x2_to_f16x2_rn_relu">,
+ Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+
+ def int_nvvm_ff_to_ue8m0x2_rz : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rz">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_ff_to_ue8m0x2_rz_satfinite : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rz_satfinite">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_ff_to_ue8m0x2_rp : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rp">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_ff_to_ue8m0x2_rp_satfinite : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rp_satfinite">,
+ Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+
+ def int_nvvm_bf16x2_to_ue8m0x2_rz : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rz">,
+ Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rz_satfinite">,
+ Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_bf16x2_to_ue8m0x2_rp : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rp">,
+ Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rp_satfinite">,
+ Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>;
+
+ def int_nvvm_ue8m0x2_to_bf16x2 : ClangBuiltin<"__nvvm_ue8m0x2_to_bf16x2">,
+ Intrinsic<[llvm_v2bf16_ty], ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/134345
More information about the llvm-commits
mailing list