[llvm] [AMDGPU] Implement llvm.lround intrinsic lowering. (PR #98970)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 19 03:59:16 PDT 2024


================
@@ -0,0 +1,551 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+
+declare float @llvm.round.f32(float)
+declare i32 @llvm.lround.i32.f32(float)
+declare i32 @llvm.lround.i32.f64(double)
+declare i64 @llvm.lround.i64.f32(float)
+declare i64 @llvm.lround.i64.f64(double)
+declare i64 @llvm.llround.i64.f32(float)
+declare half @llvm.round.f16(half)
+declare i32 @llvm.lround.i32.f16(half %arg)
+
+; Baseline: plain llvm.round.f32 returning a float, with no integer conversion.
+; The result is given a name instead of a numbered temporary.
+define float @intrinsic_fround(float %arg) {
+; GFX9-LABEL: intrinsic_fround:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_fround:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_fround:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call float @llvm.round.f32(float %arg)
+  ret float %res
+}
+
+; llvm.lround with a float source and an i32 result.
+; The result is given a name instead of a numbered temporary.
+define i32 @intrinsic_lround_i32_f32(float %arg) {
+; GFX9-LABEL: intrinsic_lround_i32_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i32_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i32_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lround.i32.f32(float %arg)
+  ret i32 %res
+}
+
+; llvm.lround with a double source and an i32 result.
+; The result is given a name instead of a numbered temporary.
+define i32 @intrinsic_lround_i32_f64(double %arg) {
+; GFX9-LABEL: intrinsic_lround_i32_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-NEXT:    s_brev_b32 s4, 1
+; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i32_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i32_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i32 @llvm.lround.i32.f64(double %arg)
+  ret i32 %res
+}
+
+; llvm.lround with a float source and an i64 result.
+; The result is given a name instead of a numbered temporary.
+define i64 @intrinsic_lround_i64_f32(float %arg) {
+; GFX9-LABEL: intrinsic_lround_i64_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i64_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i64_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.lround.i64.f32(float %arg)
+  ret i64 %res
+}
+
+; llvm.lround with a double source and an i64 result.
+; The result is given a name instead of a numbered temporary.
+define i64 @intrinsic_lround_i64_f64(double %arg) {
+; GFX9-LABEL: intrinsic_lround_i64_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-NEXT:    s_brev_b32 s4, 1
+; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i64_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i64_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %res = tail call i64 @llvm.lround.i64.f64(double %arg)
+  ret i64 %res
+}
+
+define i64 @intrinsic_llround_i64_f32(float %arg) {
+; GFX9-LABEL: intrinsic_llround_i64_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_llround_i64_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_llround_i64_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.llround.i64.f32(float %arg)
----------------
arsenm wrote:

Use named values in tests 

https://github.com/llvm/llvm-project/pull/98970


More information about the llvm-commits mailing list