[llvm] Added Conditions of SM90 and ISA7.8 for Using cvt.ftz.f32.bf16 Instruction (PR #165774)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 30 12:53:52 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-nvptx
Author: None (yasmincs)
<details>
<summary>Changes</summary>
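Updated the selection pattern that generates the `cvt.ftz.f32.bf16` instruction so that it additionally requires sm_90 and PTX ISA 7.8; the `.ftz` form is now only generated on targets that support it. On older targets (e.g. sm_80 with PTX 7.1), the plain `cvt.f32.bf16` is emitted instead, even when f32 flush-to-zero is enabled.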
---
Patch is 23.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165774.diff
2 Files Affected:
- (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+1-1)
- (modified) llvm/test/CodeGen/NVPTX/bf16-instructions.ll (+310-33)
``````````diff
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index dfde0cca0f00c..af1601f8b60fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2268,7 +2268,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>;
def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>;
// fpextend bf16 -> f32
-def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ,hasPTX<78>, hasSM<90>]>;
def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
// fpextend f16 -> f64
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 4d930cd9e57c0..3626613cf8511 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %}
@@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fadd(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
+; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fadd(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fsub(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
+; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fsub(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_faddx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
+; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_faddx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fsubx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
+; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fsubx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fmulx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
+; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fmulx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fdiv(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<5>;
+; SM90-FTZ-NEXT: .reg .b32 %r<8>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
+; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4;
+; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fdiv(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<5>;
@@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fpext_float(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fpext_float(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptrunc_float(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptrunc_float(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fadd_imm_1(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
+; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80;
+; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fadd_imm_1(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1;
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_extload_bf16x8(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<9>;
+; SM90-FTZ-NEXT: .reg .b32 %r<13>;
+; SM90-FTZ-NEXT: .reg .b64 %rd<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
+; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
+; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
+; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_extload_bf16x8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<9>;
@@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptosi_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptosi_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptoui_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptoui_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_sitofp_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_sitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i8(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i1(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .pred %p<2>;
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
+; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1;
+; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1;
+; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i1(
; SM90: {
; SM90-NEXT: .reg .pred %p<2>;
@@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i32(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i32(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i64(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b64 %rd<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i64(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) {
; SM80-FTZ-EMPTY:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/165774
More information about the llvm-commits mailing list