[llvm] [AMDGPU][GlobalISel] Add register bank legalization rules for fptoi and itofp (PR #176300)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 15 18:30:08 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Syadus Sefat (mssefat)
<details>
<summary>Changes</summary>
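This patch adds register bank legalization rules for the floating-point-to-integer (G_FPTOUI, G_FPTOSI) and integer-to-floating-point (G_UITOFP, G_SITOFP) conversion operations in the AMDGPU GlobalISel pipeline. It also adds SALU selection patterns, guarded by HasSALUFloatInsts, for uniform f16 -> i32 and i32 -> f16 conversions; these are formed as two-step chains through f32.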
---
Patch is 752.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176300.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+28-5)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+12)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/fptoi.ll (+1201)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fptosi.mir (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fptoui.mir (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sitofp.mir (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uitofp.mir (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+2565-5092)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+2661-5303)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 58feb385251ab..f659acfc57e0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -1079,14 +1079,37 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
.Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
- addRulesForGOpcs({G_FPTOUI})
+ addRulesForGOpcs({G_FPTOUI, G_FPTOSI})
+ .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
+ .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
+ .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}})
+ .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
+ .Any({{UniS16, S64}, {{UniInVgprS16}, {Vgpr64}}})
+ .Any({{DivS16, S64}, {{Vgpr16}, {Vgpr64}}})
+ .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
+ .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat)
+ .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
- .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
-
- addRulesForGOpcs({G_UITOFP})
+ .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
.Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
+ .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
+ .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
+ .Any({{UniS64, S16}, {{UniInVgprS64}, {Vgpr16}}})
+ .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr16}}});
+
+ addRulesForGOpcs({G_UITOFP, G_SITOFP})
+ .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
+ .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
+ .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
+ .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
+ .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
- .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
+ .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
+ .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
+ .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
+ .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
+ .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
+ .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}});
addRulesForGOpcs({G_FPEXT})
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 99b352bdf6765..0ab7bcf83aa7a 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -475,6 +475,18 @@ let SubtargetPredicate = HasSALUFloatInsts, AddedComplexity = 9 in {
(S_CVT_I32_F32 $src0)>;
def : GCNPat<(i16 (UniformUnaryFrag<fp_to_uint> f32:$src0)),
(S_CVT_U32_F32 $src0)>;
+
+ // f16 -> i32 : form chain f16 -> f32 -> i32
+ def : GCNPat<(i32 (UniformUnaryFrag<fp_to_sint> f16:$src0)),
+ (S_CVT_I32_F32 (S_CVT_F32_F16 $src0))>;
+ def : GCNPat<(i32 (UniformUnaryFrag<fp_to_uint> f16:$src0)),
+ (S_CVT_U32_F32 (S_CVT_F32_F16 $src0))>;
+
+ // i32 -> f16 : form chain i32 -> f32 -> f16
+ def : GCNPat<(f16 (UniformUnaryFrag<sint_to_fp> i32:$src0)),
+ (S_CVT_F16_F32 (S_CVT_F32_I32 $src0))>;
+ def : GCNPat<(f16 (UniformUnaryFrag<uint_to_fp> i32:$src0)),
+ (S_CVT_F16_F32 (S_CVT_F32_U32 $src0))>;
}
let hasSideEffects = 1 in {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fptoi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fptoi.ll
new file mode 100644
index 0000000000000..3134ec956e685
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fptoi.ll
@@ -0,0 +1,1201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX10
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX11
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @s_fptoui_f16_to_i16(half inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptoui_f16_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_u16_f16_e32 v2, s0
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptoui_f16_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_u16_f16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptoui_f16_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_cvt_u16_f16_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui half %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_fptosi_f16_to_i16(half inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptosi_f16_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_i16_f16_e32 v2, s0
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptosi_f16_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_i16_f16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptosi_f16_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_cvt_i16_f16_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptosi half %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fptoui_f16_to_i16(half %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: v_fptoui_f16_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_u16_f16_e32 v0, v0
+; GFX10-NEXT: global_store_short v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fptoui_f16_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l
+; GFX11-NEXT: global_store_b16 v[1:2], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_fptoui_f16_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX12-NEXT: v_cvt_u16_f16_e32 v0, v0
+; GFX12-NEXT: global_store_b16 v[4:5], v0, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui half %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fptosi_f16_to_i16(half %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: v_fptosi_f16_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_i16_f16_e32 v0, v0
+; GFX10-NEXT: global_store_short v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fptosi_f16_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l
+; GFX11-NEXT: global_store_b16 v[1:2], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_fptosi_f16_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX12-NEXT: v_cvt_i16_f16_e32 v0, v0
+; GFX12-NEXT: global_store_b16 v[4:5], v0, off
+; GFX12-NEXT: s_endpgm
+ %result = fptosi half %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_fptoui_f32_to_i16(float inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptoui_f32_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_u32_f32_e32 v2, s0
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptoui_f32_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_u32_f32_e32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptoui_f32_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: s_cvt_u32_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui float %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_fptosi_f32_to_i16(float inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptosi_f32_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_i32_f32_e32 v2, s0
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptosi_f32_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_i32_f32_e32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptosi_f32_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: s_cvt_i32_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptosi float %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fptoui_f32_to_i16(float %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: v_fptoui_f32_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT: global_store_short v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fptoui_f32_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT: global_store_b16 v[1:2], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_fptoui_f32_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX12-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX12-NEXT: global_store_b16 v[4:5], v0, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui float %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fptosi_f32_to_i16(float %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: v_fptosi_f32_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT: global_store_short v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fptosi_f32_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT: global_store_b16 v[1:2], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_fptosi_f32_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX12-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX12-NEXT: global_store_b16 v[4:5], v0, off
+; GFX12-NEXT: s_endpgm
+ %result = fptosi float %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_fptoui_f64_to_i16(double inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptoui_f64_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_u32_f64_e32 v2, s[0:1]
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptoui_f64_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_u32_f64_e32 v2, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptoui_f64_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_cvt_u32_f64_e32 v2, s[0:1]
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui double %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_fptosi_f64_to_i16(double inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptosi_f64_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_i32_f64_e32 v2, s[0:1]
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptosi_f64_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_i32_f64_e32 v2, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptosi_f64_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_cvt_i32_f64_e32 v2, s[0:1]
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptosi double %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fptoui_f64_to_i16(double %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: v_fptoui_f64_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-NEXT: global_store_short v[2:3], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fptoui_f64_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_fptoui_f64_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX12-NEXT: global_store_b16 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui double %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fptosi_f64_to_i16(double %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: v_fptosi_f64_to_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-NEXT: global_store_short v[2:3], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fptosi_f64_to_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_fptosi_f64_to_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX12-NEXT: global_store_b16 v[2:3], v0, off
+; GFX12-NEXT: s_endpgm
+ %result = fptosi double %x to i16
+ store i16 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_fptoui_f16_to_i32(half inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptoui_f16_to_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptoui_f16_to_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptoui_f16_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: s_cvt_f32_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_cvt_u32_f32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui half %x to i32
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_fptosi_f16_to_i32(half inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptosi_f16_to_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptosi_f16_to_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptosi_f16_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: s_cvt_f32_f16 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_cvt_i32_f32 s0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptosi half %x to i32
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fptoui_f16_to_i32(half %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: v_fptoui_f16_to_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fptoui_f16_to_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_fptoui_f16_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX12-NEXT: global_store_b32 v[4:5], v0, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui half %x to i32
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_fptosi_f16_to_i32(half %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: v_fptosi_f16_to_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_fptosi_f16_to_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_fptosi_f16_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX12-NEXT: global_store_b32 v[4:5], v0, off
+; GFX12-NEXT: s_endpgm
+ %result = fptosi half %x to i32
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @s_fptoui_f32_to_i32(float inreg %x, ptr addrspace(1) %out) {
+; GFX10-LABEL: s_fptoui_f32_to_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_u32_f32_e32 v2, s0
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fptoui_f32_to_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_u32_f32_e32 v2, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_fptoui_f32_to_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT: s_cvt_u32_f32 s0, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+ %result = fptoui float %x to i32
+ store i32 %result, ptr addrspace(...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/176300
More information about the llvm-commits mailing list