<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/60410>60410</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AArch64] Suboptimal code-gen of llvm.fptosi.sat
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
stevesuzuki-arm
</td>
</tr>
</table>
<pre>
Code-gen of `llvm.fptosi.sat` is inconsistent across data types, and for some of them it is suboptimal.
Please note that the examples below are not exhaustive (e.g. the same probably applies to `fptoui.sat`).
https://godbolt.org/z/eWd465hqa
```
target triple = "aarch64-linux-gnu"
; OK
define i32 @fptosi_sat(float %a) #0 {
%1 = call i32 @llvm.fptosi.sat.i32.f32(float %a)
ret i32 %1
}
declare i32 @llvm.fptosi.sat.i32.f32(float)
; OK
define <4 x i32> @fptosi_sat_v4(<4 x float> %a) #0 {
%1 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %a)
ret <4 x i32> %1
}
declare <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float>)
; Sub optimal
define <vscale x 4 x i32> @fptosi_sat_nxv4(<vscale x 4 x float> %a) #0 {
%1 = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f32(<vscale x 4 x float> %a)
ret <vscale x 4 x i32> %1
}
declare <vscale x 4 x i32> @llvm.fptosi.sat.nxv4i32.nxv4f32(<vscale x 4 x float>)
; Sub optimal
define i16 @fptosi_sat_i16_f16(half %a) #0 {
%1 = call i16 @llvm.fptosi.sat.i16.f16(half %a)
ret i16 %1
}
declare i16 @llvm.fptosi.sat.i16.f16(half)
; OK
define <8 x i16> @fptosi_sat_v8i16_v8f16(<8 x half> %a) #0 {
%1 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> %a)
ret <8 x i16> %1
}
declare <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half>)
; Sub optimal
define <vscale x 8 x i16> @fptosi_sat_nxv8i16_nxv8f16(<vscale x 8 x half> %a) #0 {
%1 = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f16(<vscale x 8 x half> %a)
ret <vscale x 8 x i16> %1
}
declare <vscale x 8 x i16> @llvm.fptosi.sat.nxv8i16.nxv8f16(<vscale x 8 x half>)
; Sub optimal
define i16 @fptosi_sat_i16_f32(float %a) #0 {
%1 = call i16 @llvm.fptosi.sat.i16.f32(float %a)
ret i16 %1
}
declare i16 @llvm.fptosi.sat.i16.f32(float)
; OK
define <8 x i16> @fptosi_sat_v8i16_v8f32(<8 x float> %a) #0 {
%1 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float> %a)
ret <8 x i16> %1
}
declare <8 x i16> @llvm.fptosi.sat.v8i16.v8f32(<8 x float>)
; Sub optimal
define i8 @fptosi_sat_i8_f32(float %a) #0 {
%1 = call i8 @llvm.fptosi.sat.i8.f32(float %a)
ret i8 %1
}
declare i8 @llvm.fptosi.sat.i8.f32(float)
; Sub optimal
define <16 x i8> @fptosi_sat_v16i8_v16f32(<16 x float> %a) #0 {
%1 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float> %a)
ret <16 x i8> %1
}
declare <16 x i8> @llvm.fptosi.sat.v16i8.v16f32(<16 x float>)
; Sub optimal
define i8 @fptosi_sat_i8_f16(half %a) #0 {
%1 = call i8 @llvm.fptosi.sat.i8.f16(half %a)
ret i8 %1
}
declare i8 @llvm.fptosi.sat.i8.f16(half)
; OK
define <16 x i8> @fptosi_sat_v16i8_v16f16(<16 x half> %a) #0 {
%1 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> %a)
ret <16 x i8> %1
}
declare <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half>)
```
Generated assembly (target triple = "aarch64-linux-gnu", -mattr=+sve2 -O3):
```
fptosi_sat: // @fptosi_sat
fcvtzs w0, s0
ret
fptosi_sat_v4: // @fptosi_sat_v4
fcvtzs v0.4s, v0.4s
ret
fptosi_sat_nxv4: // @fptosi_sat_nxv4
mov w8, #-822083584
ptrue p0.s
mov z2.s, #0x80000000
mov z1.s, w8
mov w8, #1325400063
fcmge p1.s, p0/z, z0.s, z1.s
movprfx z1, z0
fcvtzs z1.s, p0/m, z0.s
not p1.b, p0/z, p1.b
mov z1.s, p1/m, z2.s
mov z2.s, w8
fcmgt p1.s, p0/z, z0.s, z2.s
mov z2.s, #0x7fffffff
mov z1.s, p1/m, z2.s
fcmuo p0.s, p0/z, z0.s, z0.s
mov z1.s, p0/m, #0 // =0x0
mov z0.d, z1.d
ret
fptosi_sat_i16_f16: // @fptosi_sat_i16_f16
fcvtzs w8, h0
mov w9, #32767
cmp w8, w9
csel w8, w8, w9, lt
mov w9, #-32768
cmn w8, #8, lsl #12 // =32768
csel w0, w8, w9, gt
ret
fptosi_sat_v8i16_v8f16: // @fptosi_sat_v8i16_v8f16
fcvtzs v0.8h, v0.8h
ret
fptosi_sat_nxv8i16_nxv8f16: // @fptosi_sat_nxv8i16_nxv8f16
mov w8, #63488
ptrue p0.h
mov z1.h, w8
mov w8, #30719
fcmge p1.h, p0/z, z0.h, z1.h
movprfx z1, z0
fcvtzs z1.h, p0/m, z0.h
not p1.b, p0/z, p1.b
mov z2.h, w8
mov z1.h, p1/m, #-32768 // =0xffffffffffff8000
fcmgt p1.h, p0/z, z0.h, z2.h
mov z2.h, #32767 // =0x7fff
mov z1.h, p1/m, z2.h
fcmuo p0.h, p0/z, z0.h, z0.h
mov z1.h, p0/m, #0 // =0x0
mov z0.d, z1.d
ret
fptosi_sat_i16_f32: // @fptosi_sat_i16_f32
fcvtzs w8, s0
mov w9, #32767
cmp w8, w9
csel w8, w8, w9, lt
mov w9, #-32768
cmn w8, #8, lsl #12 // =32768
csel w0, w8, w9, gt
ret
fptosi_sat_v8i16_v8f32: // @fptosi_sat_v8i16_v8f32
fcvtzs v0.4s, v0.4s
fcvtzs v1.4s, v1.4s
sqxtn v0.4h, v0.4s
sqxtn2 v0.8h, v1.4s
ret
fptosi_sat_i8_f32: // @fptosi_sat_i8_f32
fcvtzs w9, s0
mov w8, #127
cmp w9, #127
csel w8, w9, w8, lt
mov w9, #-128
cmn w8, #128
csel w0, w8, w9, gt
ret
fptosi_sat_v16i8_v16f32: // @fptosi_sat_v16i8_v16f32
movi v4.4s, #127
fcvtzs v3.4s, v3.4s
fcvtzs v2.4s, v2.4s
fcvtzs v1.4s, v1.4s
fcvtzs v0.4s, v0.4s
mvni v5.4s, #127
smin v3.4s, v3.4s, v4.4s
smin v2.4s, v2.4s, v4.4s
smin v1.4s, v1.4s, v4.4s
smin v0.4s, v0.4s, v4.4s
smax v3.4s, v3.4s, v5.4s
smax v2.4s, v2.4s, v5.4s
smax v1.4s, v1.4s, v5.4s
smax v0.4s, v0.4s, v5.4s
uzp1 v2.8h, v2.8h, v3.8h
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v0.16b, v0.16b, v2.16b
ret
fptosi_sat_i8_f16: // @fptosi_sat_i8_f16
fcvtzs w9, h0
mov w8, #127
cmp w9, #127
csel w8, w9, w8, lt
mov w9, #-128
cmn w8, #128
csel w0, w8, w9, gt
ret
fptosi_sat_v16i8_v16f16: // @fptosi_sat_v16i8_v16f16
fcvtzs v0.8h, v0.8h
fcvtzs v1.8h, v1.8h
sqxtn v0.8b, v0.8h
sqxtn2 v0.16b, v1.8h
ret
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8Wk1zqzgW_TXyRmVKSIDFwosknmxm0VPVi1mmwAjDPD78kCB-_vVT4st8SBg7Sbu6nulwdc7V0blXgrLHeXzKGNsD-xXYh41Xiigv9lywivHyWv6Kt16Rbvw8-LN_ywO2PbEM5iEEDkqSKjXCs8h5bHBPAAfBJM9_cRhnxzzjMRcsEzBgZ5YFcXaCgSc8KP6cGYdeFkCep0wiiYil0CsY5KWfn0WceokB0AGgl_8kzOMMZrlgMgqyi5eeE8ahz5L8sx6T5QKyS-SVXMQVMyDAlBknA56L3Pf85E89jnspg6eccShymbjMuexyBtht6SIhzhyQF4DfAX4_5YGfJ8LIixPA71eA39l_A8uxo99eE97-66D2v_p_hVecmICiiM8Jg4AcIMDY84pj5FjbJM7Ky_aUlQDjEQZ5hX_9u7kOWBhnDMYEQ2ChRt0PmSmmYZJ7AgJsewC7EGCCINi9wmYchFDeMmvOo5ckHcRklYyYYCMkeIp3QymYaMZi22zz2x265I6JVH0ddI-qnSUgbxa8SDhA_jWe70dlAUzbgAZOhqyd_Ax5mmtlyWwrq8lXwzMWZYKpl-dr5ArZ_i592JbGVL-KH72EwQvUCpldOilHsU9pqmGbzk9SyhnK736Oi-wzpZVMi5J_e27Thbi7HLHpTLSPTecjNB2AaeQl4QO12yDNCsx0jDncpHTl0IXSXYO8qnKplNl0FJVL5bQr2uC1kTXsg24bM8yKSNIYyzQzWw0hF930PPeT9auVM7s0gsrvnm807BltNbyKaqmnuppcX8irpf_u1L6rjueb5tcK-e4m_HwlP7gJ3y_lrlHSZ3eOlfW0xPPjxawif9w8dOod-qR1qHJ56Qrj0EXfrMB9rIeZjtSXKuxjOjGV__bS1qFPGWhMMltDyWTcZZpZaIi6aKEv0X-PiZ45R2jX-t4p4kkLPXaEWGGcrrHXoc9sdGsXbono522jYr9JOH7GhKsfMuE29YQoADkA_MorhuH2L6LEHDxoknqi2k_zdDx5OG3yCo-VuHIIPxHAb5Cjm2ytdFMu-ZC3RKfikmNGuB1rhQyLS-LmoolRkdYPRHpaJWk9poFM86qO-6SSDGCypRgjSmw6SewsipLJb2Tw8Z0O4ooN3oKgC0XNRxNqNqGfVH2_z8Yk2LYQQg6ZypSe6mxaoDNqXmu8wStq_lJTTMHPRXiBV7OJUwt_HSKmPeIoNstFo4lp-GP2-i-LUz6bPS6-p2Qvj5yuWJ7uXbB6WXZh85ms_prkwmNa5p0B1DnchJoB92rWXU5rU3JAF51nkBG0Cxvcr8XuWVVTGcqy6MYofdFYMtIk9-m2cyN45-zGMcf0PDD1pzu5y1kyuNvF4DeYiGmNdiRbyUKnLNmABWBSfyc8qcsILyiuAuuSQtOkTmKhFw2flRXCqxvgYMy48VbIoFHbAml0f8mnj5bkRdv9RnHLHcghFu3UGfTASFvl0arGRtDOdAfV3TazaFZZUev5OaGqmQ26WDTrYtHXutgVL8-t5zQHtd5YVeM8dAkHHzrbLgZtTycMVq1El2lXjzMnTrLY3Tri3TnNGQeNUZflXb_8TH_UNkaCH2-MBE8PRlR1MPpCR1zRCnUkuo74T3XD9rPYFNWiLzfFXvTeawvHw3FMZXYh5iyE_74IuV3I0ZEGpo7BwzY8x1EajC75S2Mw-qGdaqO01mb9SRF3Juvd5c5uTRd0sJTNtcpoN5OZ-N6mO494cicdvmuYS6k2zXDMdBax_K6s1hIqUTrbkM42ROcsWOEuButjFuy3xsVplTU526qceRrXyk-ylRfW3Mld8DjtSXAfNU78DuR4BjNI76LJ0lZAtsHzLG0F5DzLBch5loPg8no2W-K20PsLMjt49cHjw1mTSB88iDIdvw3rrnB9taqPLJzi2tsLLUN1Vle0i76U
v6FrzPeln2kZC7KNXjY93Dl0ii6dxW_7zdwInRFv-w31NTCD_aZzyhyon3H_xmcT7EngEtfbsL3p7GzTMV3X2kT7IKTh0XN9H_tBQCzGCPLsnWkzF-9cYrqbeI8RJsgkpomIg4iBXBa6NHSOtoMDRo7AQiz14sSo33TlxWkTc16yvYMsE20Sz2cJr39pgnHGPmF9E2AM7MOm2MsxW788cWChJOaC31BELJL6JyovL_VbLmAf4N_9z0XgcfDDlMk7tk1ZJPvJLztiEZW-ccxTgN9lePu1PRf5_9hRAPxeJ8YBfq8T_38AAAD___SPN8U">