<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/103855>103855</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[ISel AArch64] extra instruction (i256) or 2 extra instructions (i320) when chaining icmp and select based on underflow
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
mratsim
</td>
</tr>
</table>
<pre>
Same IR as https://github.com/llvm/llvm-project/issues/103841, but applied to AArch64, as an alternative to https://github.com/llvm/llvm-project/issues/103717.
Unlike x86, there is always an extra instruction, even for i256, and there are 2 unnecessary instructions for i320 or i384.
https://alive2.llvm.org/ce/z/-bGiUs
## Full code
### Original IR
```llvm
; ModuleID = 'x86_poc'
target triple = "arm64"
; target triple = "x86_64"
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64
; Function Attrs: hot
define internal fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%M = load i256, ptr %3, align 4
%a_plus_b = add i256 %a, %b
%5 = sub i256 %a_plus_b, %M
%6 = lshr i256 %5, 255
%7 = trunc i256 %6 to i1
%8 = select i1 %7, i256 %a_plus_b, i256 %5
store i256 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
define internal fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
%a = load i320, ptr %1, align 4
%b = load i320, ptr %2, align 4
%M = load i320, ptr %3, align 4
%a_plus_b = add i320 %a, %b
%5 = sub i320 %a_plus_b, %M
%6 = lshr i320 %5, 319
%7 = trunc i320 %6 to i1
%8 = select i1 %7, i320 %a_plus_b, i320 %5
store i320 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
define internal fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
%a = load i384, ptr %1, align 4
%b = load i384, ptr %2, align 4
%M = load i384, ptr %3, align 4
%a_plus_b = add i384 %a, %b
%5 = sub i384 %a_plus_b, %M
%6 = lshr i384 %5, 383
%7 = trunc i384 %6 to i1
%8 = select i1 %7, i384 %a_plus_b, i384 %5
store i384 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
define void @bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bn254_snarks_fp" {
call fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fp_mod)
ret void
}
; Function Attrs: hot
define void @bls24_317_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls24_317_fp" {
call fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr @bls24_317_fp_mod)
ret void
}
; Function Attrs: hot
define void @bls12_381_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls12_381_fp" {
call fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr @bls12_381_fp_mod)
ret void
}
attributes #2 = { hot }
```
### After opt -O3
```llvm
target triple = "arm64"
; target triple = "x86_64"
define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bn254_snarks_fp" {
%.val = load i256, ptr %1, align 4
%.val1 = load i256, ptr %2, align 4
%a_plus_b.i = add i256 %.val1, %.val
%4 = add i256 %a_plus_b.i, -21888242871839275222246405745257275088696311157297823662689037894645226208583
%.not1.i = icmp slt i256 %4, 0
%5 = select i1 %.not1.i, i256 %a_plus_b.i, i256 %4
store i256 %5, ptr %0, align 4
ret void
}
define void @bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls24_317_fp" {
%.val = load i320, ptr %1, align 4
%.val1 = load i320, ptr %2, align 4
%a_plus_b.i = add i320 %.val1, %.val
%4 = add i320 %a_plus_b.i, -136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051
%.not1.i = icmp slt i320 %4, 0
%5 = select i1 %.not1.i, i320 %a_plus_b.i, i320 %4
store i320 %5, ptr %0, align 4
ret void
}
define void @bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls12_381_fp" {
%.val = load i384, ptr %1, align 4
%.val1 = load i384, ptr %2, align 4
%a_plus_b.i = add i384 %.val1, %.val
%4 = add i384 %a_plus_b.i, -4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
%.not1.i = icmp slt i384 %4, 0
%5 = select i1 %.not1.i, i384 %a_plus_b.i, i384 %4
store i384 %5, ptr %0, align 4
ret void
}
attributes #0 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
```
### Assembly
```asm
bn254_snarks_fp_add: // @bn254_snarks_fp_add
ldp x8, x10, [x1]
mov x15, #24534 // =0x5fd6
ldp x9, x11, [x2]
movk x15, #7886, lsl #16
ldp x13, x14, [x2, #16]
movk x15, #45453, lsl #32
movk x15, #53147, lsl #48
adds x8, x9, x8
ldp x12, x9, [x1, #16]
adcs x10, x11, x10
mov x11, #697 // =0x2b9
movk x11, #10115, lsl #16
adcs x12, x13, x12
mov x13, #13682 // =0x3572
movk x11, #29673, lsl #32
movk x13, #38798, lsl #16
adc x9, x14, x9
mov x14, #42914 // =0xa7a2
movk x11, #50143, lsl #48
movk x13, #38254, lsl #32
movk x14, #32382, lsl #16
movk x13, #26750, lsl #48
adds x11, x8, x11
movk x14, #47689, lsl #32
movk x14, #18351, lsl #48
adcs x13, x10, x13
adcs x14, x12, x14
adc x15, x9, x15
cmp x15, #0
csel x12, x12, x14, lt
csel x9, x9, x15, lt
csel x8, x8, x11, lt
stp x12, x9, [x0, #16]
csel x9, x10, x13, lt
stp x8, x9, [x0]
ret
bls24_317_fp_add: // @bls24_317_fp_add
ldp x8, x10, [x1]
mov x16, #12230 // =0x2fc6
ldp x9, x11, [x2]
movk x16, #18438, lsl #16
ldp x13, x14, [x2, #16]
mov x17, #30419 // =0x76d3
ldr x15, [x2, #32]
movk x16, #14943, lsl #32
movk x17, #37023, lsl #16
adds x8, x9, x8
ldp x12, x9, [x1, #16]
adcs x10, x11, x10
ldr x11, [x1, #32]
movk x16, #3440, lsl #48
movk x17, #13789, lsl #32
adcs x12, x13, x12
mov x13, #54613 // =0xd555
movk x17, #61351, lsl #48
movk x13, #41556, lsl #16
adcs x9, x14, x9
mov x14, #16513 // =0x4081
movk x13, #53673, lsl #32
movk x14, #52187, lsl #16
adc x11, x15, x11
mov x15, #52153 // =0xcbb9
movk x13, #29358, lsl #48
movk x14, #50715, lsl #32
movk x15, #31544, lsl #16
movk x14, #10508, lsl #48
adds x13, x8, x13
movk x15, #40473, lsl #32
adcs x14, x10, x14
movk x15, #59749, lsl #48
adcs x15, x12, x15
adcs x16, x9, x16
adc x17, x11, x17
asr x18, x17, #63
cmp x18, #0
csel x11, x11, x17, lt
csel x9, x9, x16, lt
csel x12, x12, x15, lt
csel x8, x8, x13, lt
stp x9, x11, [x0, #24]
csel x9, x10, x14, lt
str x8, [x0]
stp x9, x12, [x0, #8]
ret
bls12_381_fp_add: // @bls12_381_fp_add
ldp x8, x10, [x1]
mov x17, #60736 // =0xed40
ldp x9, x11, [x2]
movk x17, #3194, lsl #16
ldp x13, x14, [x2, #16]
mov x18, #21288 // =0x5328
ldp x15, x16, [x2, #32]
movk x17, #46203, lsl #32
adds x8, x9, x8
ldp x12, x9, [x1, #16]
adcs x10, x11, x10
movk x18, #48308, lsl #16
movk x17, #39816, lsl #48
movk x18, #22601, lsl #32
adcs x12, x13, x12
ldp x11, x13, [x1, #32]
adcs x9, x14, x9
mov x14, #21845 // =0x5555
mov x1, #6501 // =0x1965
movk x14, #17921, lsl #48
movk x1, #50816, lsl #16
movk x18, #46308, lsl #48
adcs x11, x15, x11
mov x15, #1319895040 // =0x4eac0000
movk x1, #60949, lsl #32
adc x13, x16, x13
mov x16, #2523 // =0x9db
movk x15, #1, lsl #32
movk x16, #2383, lsl #16
movk x15, #57684, lsl #48
adds x14, x8, x14
movk x16, #11615, lsl #32
adcs x15, x10, x15
movk x1, #58878, lsl #48
movk x16, #39119, lsl #48
adcs x16, x12, x16
adcs x17, x9, x17
adcs x18, x11, x18
adc x1, x13, x1
asr x2, x1, #63
cmp x2, #0
csel x11, x11, x18, lt
csel x13, x13, x1, lt
csel x9, x9, x17, lt
stp x11, x13, [x0, #32]
csel x11, x12, x16, lt
csel x8, x8, x14, lt
stp x11, x9, [x0, #16]
csel x9, x10, x15, lt
stp x8, x9, [x0]
ret
```
## Analysis
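With i256, the `cmp` is unnecessary in this sequence: comparing against zero and selecting on `lt` only tests the sign bit of the high limb, which a flag-setting `adcs` (in place of the final `adc`) would already expose in the N flag, so the `csel`s could test `mi` directly.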
```asm
movk x11, #50143, lsl #48
movk x13, #38254, lsl #32
movk x14, #32382, lsl #16
movk x13, #26750, lsl #48
adds x11, x8, x11
movk x14, #47689, lsl #32
movk x14, #18351, lsl #48
adcs x13, x10, x13
adcs x14, x12, x14
adc x15, x9, x15
cmp x15, #0 // <----- unnecessary
csel x12, x12, x14, lt
csel x9, x9, x15, lt
csel x8, x8, x11, lt
stp x12, x9, [x0, #16]
csel x9, x10, x13, lt
stp x8, x9, [x0]
ret
```
as demonstrated by https://github.com/llvm/llvm-project/issues/103717
With i320, as on x86 (https://github.com/llvm/llvm-project/issues/103841), there is an additional unnecessary `asr` instruction on top of the `cmp` (the i384 output above shows the same `asr` + `cmp` pair):
```asm
movk x13, #29358, lsl #48
movk x14, #50715, lsl #32
movk x15, #31544, lsl #16
movk x14, #10508, lsl #48
adds x13, x8, x13
movk x15, #40473, lsl #32
adcs x14, x10, x14
movk x15, #59749, lsl #48
adcs x15, x12, x15
adcs x16, x9, x16
adc x17, x11, x17
asr x18, x17, #63 // <----- unnecessary
cmp x18, #0 // <----- unnecessary
csel x11, x11, x17, lt
csel x9, x9, x16, lt
csel x12, x12, x15, lt
csel x8, x8, x13, lt
stp x9, x11, [x0, #24]
csel x9, x10, x14, lt
str x8, [x0]
stp x9, x12, [x0, #8]
ret
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzkW0tv67gV_jXKhrgBefheZJE7QYq7GAwwxaDLgJaYRB1ZckU6j_76Qu8XZcs3uTPT1ghiRzo8L3485yNjGefSp9zam4h_jfjdlTn656K82ZfGu3R_tSuS95u_m71F335FxqFn7w8uorcR3Edw_5T65-PuOi72Edxn2Uv39uVQFv-0sY_gPnXuaF0E9wRTxQjaHT0yh0OW2gT5At2aMn4WrFJtcmQyb8vc-PTFVjc_ZEwSGeG7CN82v3_Ls_R3i96UQP7ZlhalDpns1bzXhu2bLw1Kc-fLY-zTIkf2xebosShRClxE8BMyedKONKVFgI55bmPrnCnfJwPrMRQwqt8Vux57MY3IZOmLhesqiOuifIrgPrYR3P87gvsvu7-lv7nx0AhoBBTdH7MMxUVil_eq27-U6VOamwx9-3UiIHDzU-etuUS_op-L5JjZb3cooncoAvmmxMOhiCNoU-dN-WQ98mV6yGwrBKbcCxYBDGqCYpWusVzzm-FdDpw9uNyUv7uHx8PDvkjqIXGRO29yXyccAVFKAQMliaIaJAcAYIJhLhkHLkFyrJTQghJCuAQtFVAhQCiNqVSaCcYBBGDFFa2mz9lmeiKA2PvrmRfXnXVXOVzNdpY-5Uiwwe3MEXigioR9poohhjEwrDnnAEQISTVlREqlFXBJucaMcMG5UAoU0ZpqjKVSnFLAXBEqCDBMieCYaayoVIIxBkJJAZqAxoQLwZrgQALnWioZimzk6KawgD1QIlfCAowIFVRTLAnBDDTXhHDCsdZScI0VyGpOsCCacUKAAGFEc021lEAwqUZSjbmWlGgJFDBwQRQGQbiQlGBOwjH0Xp2LoUfh_TFvlNx6X1aLDD0Xvrmd2Mc0tyjN6_KSoUfjfByjlyJNUMRwFbhJkoe8KK6Pgr2xCNTBlygCjiuL7Wcy-gyjzzQCjSKgMA_jMbVZUrmNIvm18QRVA0yd5awwSV9dxiaaANv4qqu7VXlYyleXf14dQIMGzMMhO7qHxo5JmlH1jUq88mCknNdS7rgbpNrhrfDPI2HReOKey16aV2LV2yAlaylfHvO4FxNVB0jJSEo1hm1mY49SUg-sdIXcGIy1450vqqLfXlajlOBADkvra3S06JJ3Pw5s_A8EG4WFiSV4dqsDNqBtOiCAtjDcqjoTgNsUbZ3QNrS10jXaKNFraGvFLkBbwI3B2BRt7eW_ENrEH4k2xS5E22TAFrRNBmxHm2IbilsntRFujXQDN0VX4daIXQK3pRuDsRncmst_ANw6eM3JnEmSTQALg2qmbYau2GSf27mDXDQC_SPyNKZZH0zSSNV3ZOiidrPkh5vT831JGij2h5PUq_qOJF1UJZd7g61JMt6X6e7orWuCqHdO8muVIDSIdlu38Ibv9tHbEhUHj778Qs9s-T59O7e5GuRFbA7-WFr0WqbeFnn2PknucL-0Juluk9O369nPithkD8c8N3ubVBar2aH4stISAb9-MdkltLwbQy7l5l0hv04XbLvW1_aZ6vPAgtiSmfdqqgFfPne_3Nu9zgtPWk_TeH9ALvO9C3XvxcvmOe5hrYIQTZ9eZWGizj_Sy7YV4T8fm-sVfQnMsxQ-gMtNLD6Ey5a_ruByBswZL26B-cNPD85gtfXqUqwGgxmUBWn-J2M11Av_Clhda6wBrJ7bAISwumUPEMRqQ343YnVKqlus_hkHeOfg2zh6MXxD8Q3KgtuGD8F3SmXwhMrsj84fyuKptM6hvHgsrUV5Udr4WLrqk3vPY5QXx_w1zRP0mmZZaf2xzNHe7ovyPQJlyqe93VfcscJtDf2a-W3lSM7Z_S57D9Ij41p2FGIv9BYFXs3p_RrjadPVvLLkUL-_1ZuyN1InNuJf30jE76ai--KlESW8wS8FxikLOdB7QO_wG39MxIpR3RglnVEYjO6Ll9-n1qRS
NX3JXFb9SdaUEtpoZYPWRgERp9QzzpodequfwiL82QhOCZOjEUy1I0ySuFFSmyjVmrswCDWJX3prkrhR2MxPm7Hqr5UZ6pQILYPzM50h2Om1WHtvMGniXsn-4GETTjcJyySOJ6lSRIWCsx5SLmExc51voIXcNnOdUaqkVqFoTBJPkMnauVkJgnXYAU3OLwQjzapnXSwcE0ZDmDoRS1t-z0bfeUuBKjgxl_WIwQAI2dTbFZd6tLeoVB1Ez7jBpFD6IseJok2XXvUkdgP0cAfEFaCyDqHtTM_F2jnmozVM-FSqaoNoUhNmKzJ2NkPjZQEjXGXtMcMgpae2xjJzhWqa64Co84dldcGB6jJXrWf5W9Ot5qrnGkvbDlvsbMJ9a9q65mM-3rdEFzxUFOCU-boqPsYf6lu9NcVosNjMtW5tXK207JYzZkSfjUaKhM7tlj1wR8YohDI4j4lpdrbkjlyUGOjJ7vHJDbPXu6lvVplAozI8UropG5SxUwVyGCD7pidPlr7v7qacCUIn855wzs-5I8hKXV12G0Y4P8XCes_PNdBRVRec0LP4ZVitdpQ-erqVCHS2ORAlT4czRkbXDQLNrRXjg2J-Pqh41zOvZTSgKVebYNVHg-WEpNElZ-r8o4Qzdo4DTJsv5jjoztD_6bgn0XP0mWEWnq1Fi8bBFh0g5FoyfZ4f8Ekr5mtLT4yb8Sou5KSwyJmY6wqL6u63K46uEQk1IxIDgyAzS6fYgZ76viY6pyUXUI5VWoBCDbKjHcC20g4W1t_mU62TjrkPMPdBneQp01MtejtlJNO7H2ckPR6wpAKFXuNyYRPWwWILFVmsk74dE31q7Ve6t5GRRTwdeoGAUmfj4RRWmzwfYXczPekCZALwqUbwJ_ONweEuX0xRfIojBuZQKyI2dYd-TkBg8nHSMeSFjMTOcKbNtABNeg4QxfgUMiE6047q1hLHZDKGaLFKgfruJjWc2loOI_peO83_-oz1cyxouIGOc3Qx0SCUaKU5ZosNzYQ9WRNjjFdx2Jchzc7w0tY8HS3PUK9vxTqGDBzCbGjipU52q4TlFHKXjByoCu43ApxBCsVOzkoyISNqnYuMtkdEhInYfDl004yDXKTV2nmqlNzGB_t9iSZkAx8SEw6werAnx5xiTnR6KTUpfEuDaABcj6IV0tQ6NKdMPVeC6MyZC5n5corcdL7Qweg2dhUkYj0JWRRJvF4kF47DOf62YGUrrGnqzAcOg4IEsde__UDoxD9E0G1usneXTh4u-Efqn_vvL_hniyKB4_0hEhilDh2dzaxzKM2Rf04dcvZfR5vHdmar_0_KcsVcdPz633n2-r9-8Dp0kp--VK_xAzDfcyh7ctF_xtksCpHLTzyf_diSNA4ldl_kzpfG2wTt3j_1cadmQbdf_HDpPs1MiXxRP__00We42hrRPkOVF9XnaqWkPi1yk6G6FJR17Riei5oUog0V4wec0ixG_IDDmnHd-LQDm7UV_qnnNmO69COPbk6c2aALaszyQOeS0Rcf-mw98QmUvf-3Q5_TJOQquaGJptpc2Rsiq1mXkuCr5xsGOwtKKKGFoHwnARS1Wu2MtDhJsLlKbwADw4owAoRjcS25emRSyR1_3JEk1hHDdm_SrH-g8qouXDdV1eL8KjM7m7n6OVeA3L6i-m4EFVG8Km_qmrc7PrmI4Sx13g1qfOqz-gHZb1Va20dWI34XeGw0AtXQKI2KEsH4nqtv1iVZo9dnm6P42aR5mj813_cxedJ9oWdnnE1QkaNjntjyMSter45ldvOxws351csN_CcAAP__CR4CYA">