<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/103717>103717</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[Isel x86/Aarch64] i320+ generates useless extra instructions when chaining sub.with.overflow and select / conditional moves.
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
mratsim
</td>
</tr>
</table>
<pre>
Yet another experiment to properly generate efficient modular addition in pure LLVM IR, on at least x86 and ARM, without inline assembly (follow-up of #102062, #102868). This time using the llvm.uadd.with.overflow.iXXX and llvm.usub.with.overflow.iXXX intrinsics.
The code is optimal on both x86 and ARM for i256, but has an extra `sbb, sbb, or` sequence for i320 or i384. The LLVM IR has the same structure in all cases, so I assume that some threshold is reached at the MIR level.
Test case: https://alive2.llvm.org/ce/z/fV77tb
```llvm
; ModuleID = 'x86_poc'
source_filename = "x86_poc"
; target triple = "arm64"
target triple = "x86_64"
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i256, i1 } @llvm.uadd.with.overflow.i256(i256, i256) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i256, i1 } @llvm.usub.with.overflow.i256(i256, i256) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i384, i1 } @llvm.uadd.with.overflow.i384(i384, i384) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i384, i1 } @llvm.usub.with.overflow.i384(i384, i384) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i320, i1 } @llvm.uadd.with.overflow.i320(i320, i320) #0
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare { i320, i1 } @llvm.usub.with.overflow.i320(i320, i320) #0
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x4(ptr noalias nocapture writeonly %0, i256 %.0.val, i256 %.0.val1, i256 %.0.val3) #2 section "ctt.fields" {
%2 = call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %.0.val, i256 %.0.val1)
%.lo = extractvalue { i256, i1 } %2, 0
%3 = call { i256, i1 } @llvm.usub.with.overflow.i256(i256 %.lo, i256 %.0.val3)
%.lo1 = extractvalue { i256, i1 } %3, 0
%.borrow = extractvalue { i256, i1 } %3, 1
%4 = select i1 %.borrow, i256 %.lo, i256 %.lo1
store i256 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x5(ptr noalias nocapture writeonly %0, i320 %.0.val, i320 %.0.val1, i320 %.0.val3) #2 section "ctt.fields" {
%2 = call { i320, i1 } @llvm.uadd.with.overflow.i320(i320 %.0.val, i320 %.0.val1)
%.lo = extractvalue { i320, i1 } %2, 0
%3 = call { i320, i1 } @llvm.usub.with.overflow.i320(i320 %.lo, i320 %.0.val3)
%.lo1 = extractvalue { i320, i1 } %3, 0
%.borrow = extractvalue { i320, i1 } %3, 1
%4 = select i1 %.borrow, i320 %.lo, i320 %.lo1
store i320 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x6(ptr noalias nocapture writeonly %0, i384 %.0.val, i384 %.0.val1, i384 %.0.val3) #2 section "ctt.fields" {
%2 = call { i384, i1 } @llvm.uadd.with.overflow.i384(i384 %.0.val, i384 %.0.val1)
%.lo = extractvalue { i384, i1 } %2, 0
%3 = call { i384, i1 } @llvm.usub.with.overflow.i384(i384 %.lo, i384 %.0.val3)
%.lo1 = extractvalue { i384, i1 } %3, 0
%.borrow = extractvalue { i384, i1 } %3, 1
%4 = select i1 %.borrow, i384 %.lo, i384 %.lo1
store i384 %4, ptr %0, align 4
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #1 section "ctt.bn254_snarks_fp" {
%.val = load i256, ptr %1, align 4
%.val1 = load i256, ptr %2, align 4
call fastcc void @_modadd_noo.u64x4(ptr noalias %0, i256 %.val, i256 %.val1, i256 21888242871839275222246405745257275088696311157297823662689037894645226208583)
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #1 section "ctt.bls24_317_fp" {
%.val = load i320, ptr %1, align 4
%.val1 = load i320, ptr %2, align 4
call fastcc void @_modadd_noo.u64x5(ptr noalias %0, i320 %.val, i320 %.val1, i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051)
ret void
}
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define void @bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) #1 section "ctt.bls12_381_fp" {
%.val = load i384, ptr %1, align 4
%.val1 = load i384, ptr %2, align 4
call fastcc void @_modadd_noo.u64x6(ptr noalias %0, i384 %.val, i384 %.val1, i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787)
ret void
}
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) }
```
## Details i256
```llvm
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x4(ptr noalias nocapture writeonly %0, i256 %.0.val, i256 %.0.val1, i256 %.0.val3) #2 section "ctt.fields" {
%2 = call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %.0.val, i256 %.0.val1)
%.lo = extractvalue { i256, i1 } %2, 0
%3 = call { i256, i1 } @llvm.usub.with.overflow.i256(i256 %.lo, i256 %.0.val3)
%.lo1 = extractvalue { i256, i1 } %3, 0
%.borrow = extractvalue { i256, i1 } %3, 1
%4 = select i1 %.borrow, i256 %.lo, i256 %.lo1
store i256 %4, ptr %0, align 4
ret void
}
```
Generated on x86:
```asm
_modadd_noo.u64x4: # @_modadd_noo.u64x4
add rsi, qword ptr [rsp + 8]
adc rdx, qword ptr [rsp + 16]
adc rcx, qword ptr [rsp + 24]
adc r8, qword ptr [rsp + 32]
mov rax, rsi
sub rax, qword ptr [rsp + 40]
mov r9, rdx
sbb r9, qword ptr [rsp + 48]
mov r10, rcx
sbb r10, qword ptr [rsp + 56]
mov r11, r8
sbb r11, qword ptr [rsp + 64]
cmovb r10, rcx
cmovb r11, r8
cmovb rax, rsi
cmovb r9, rdx
mov qword ptr [rdi + 24], r11
mov qword ptr [rdi + 16], r10
mov qword ptr [rdi + 8], r9
mov qword ptr [rdi], rax
ret
```
Notice how sub+3xsbb flows into cmovb.
On ARM64
```
_modadd_noo.u64x4: // @_modadd_noo.u64x4
ldp x8, x10, [sp]
adds x9, x2, x6
ldp x11, x13, [sp, #16]
adcs x12, x3, x7
ldp x14, x15, [sp, #32]
adcs x8, x4, x8
adc x10, x5, x10
subs x11, x9, x11
sbcs x13, x12, x13
sbcs x14, x8, x14
sbcs x15, x10, x15
csel x10, x10, x15, lo
csel x8, x8, x14, lo
stp x8, x10, [x0, #16]
csel x10, x12, x13, lo
csel x8, x9, x11, lo
stp x8, x10, [x0]
ret
```
Notice how subs + 3x sbcs flow directly into the conditional selects (csel).
## Details i320
```llvm
; Function Attrs: hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write)
define internal fastcc void @_modadd_noo.u64x5(ptr noalias nocapture writeonly %0, i320 %.0.val, i320 %.0.val1, i320 %.0.val3) #2 section "ctt.fields" {
%2 = call { i320, i1 } @llvm.uadd.with.overflow.i320(i320 %.0.val, i320 %.0.val1)
%.lo = extractvalue { i320, i1 } %2, 0
%3 = call { i320, i1 } @llvm.usub.with.overflow.i320(i320 %.lo, i320 %.0.val3)
%.lo1 = extractvalue { i320, i1 } %3, 0
%.borrow = extractvalue { i320, i1 } %3, 1
%4 = select i1 %.borrow, i320 %.lo, i320 %.lo1
store i320 %4, ptr %0, align 4
ret void
}
```
The structure is the same as for i256, so you would expect the same optimized assembly. However,
on x86-64:
```asm
_modadd_noo.u64x5: # @_modadd_noo.u64x5
push r15
push r14
push r13
push r12
push rbx
add rsi, qword ptr [rsp + 48]
adc rdx, qword ptr [rsp + 56]
adc rcx, qword ptr [rsp + 64]
adc r8, qword ptr [rsp + 72]
adc r9, qword ptr [rsp + 80]
xor r10d, r10d
mov rax, rsi
sub rax, qword ptr [rsp + 96]
mov r11, rdx
sbb r11, qword ptr [rsp + 104]
mov rbx, rcx
sbb rbx, qword ptr [rsp + 112]
mov r14, r8
sbb r14, qword ptr [rsp + 120]
mov r15, r9
sbb r15, qword ptr [rsp + 128]
mov r12d, 0
sbb r12, r12
mov r13d, 0
sbb r13, r13
sbb r10, r10
or r10, r12
or r10, r13
cmovne r15, r9
cmovne rax, rsi
cmovne r11, rdx
cmovne rbx, rcx
cmovne r14, r8
mov qword ptr [rdi + 24], r14
mov qword ptr [rdi + 16], rbx
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi], rax
mov qword ptr [rdi + 32], r15
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
```
You have sub + 4 useful sbb, but then this whole sequence before cmovne is extremely wasteful:
```S
mov r12d, 0
sbb r12, r12
mov r13d, 0
sbb r13, r13
sbb r10, r10
or r10, r12
or r10, r13
```
And on ARM64 it's the same
```asm
_modadd_noo.u64x5: // @_modadd_noo.u64x5
ldp x8, x9, [sp]
ldr x13, [sp, #32]
ldp x10, x11, [sp, #16]
ldr x17, [sp, #80]
ldp x12, x14, [sp, #48]
adds x8, x2, x8
ldp x15, x16, [sp, #64]
adcs x9, x3, x9
adcs x10, x4, x10
adcs x11, x5, x11
adc x13, x6, x13
subs x12, x8, x12
sbcs x14, x9, x14
sbcs x15, x10, x15
sbcs x16, x11, x16
sbcs x17, x13, x17
ngcs x18, xzr
ngcs x1, xzr
ngc x2, xzr
orr x18, x1, x18
orr x18, x18, x2
cmp x18, #0
csel x9, x9, x14, ne
csel x8, x8, x12, ne
csel x11, x11, x16, ne
csel x10, x10, x15, ne
stp x8, x9, [x0]
csel x9, x13, x17, ne
stp x10, x11, [x0, #16]
str x9, [x0, #32]
ret
```
This is totally unnecessary
```
ngcs x18, xzr
ngcs x1, xzr
ngc x2, xzr
orr x18, x1, x18
orr x18, x18, x2
cmp x18, #0
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsW0tv5DYS_jXyhZgGH3pQBx_sMbwYILMLzAZBcjIoie3mhhI7JOVu59cvSKnVevbDmUHGuzMIrLRYVawqVpFflSRmjHiuOL8Novsgerhhtd0ofVtqZo0obzJVvN7-xi1glbIbrgHfb7kWJa8ssApstdpyLV_BM6-4ZpYDvl6LXLjhUhW1ZBqwohBWqAqICmxrzcFPP_3yGXz64n4zCyRnxoI9jQGrCsB0CXbCblRtgaikqDhgxvAyk68gwHStpFS7D_UWqDUIMEEQwxgH-GP7g8Y0wOkK_LwRBlhRclAbUT0Du-FAVFaLyojcAClfylXNimLl5lqpF67XUu1W4tdff10F8CGAd83fnzcc5KrgQBigtlaUTAJVgUzZTafy3ZfPYK00EDiKQVZbsGEG8L3VDJgsc7q1F6WB4X_UvMp5w0AwBP5KQ6fy0TNOAgNGlMI50Fhd59Z5zijwyfmjLjkQNsCJcR50xn3-9AVI_sIlsBtmgVElB3ajudkoWTjtNWf5hhdD67ixIGeGB-QObKzdmoDcBfgxwI9MiheOV95RSj8H-DHnAX78M8CP61-SxGZ9OUEMm_8ceXuL3IPPLgD4pwcQkAcQ4GRP46etygOcNDRG1TrnT2shecVK3pLhjgwfRVmmn7kFVout7AiZLuOwI5slcbJ6NO3fEGYVjsInUzH9u3lab59KVXiWXFXGsso2i4kRpRSHmCaIkhQnEcYYh3EIoySMcJTgJIKUxmlMEEJRgtOEYhLHOKYpJAlNwziMMI4xpBElPhB47jMhwDi3djXSYnWY3TiF8UfApHiuQBwe1ZYG4SdC0bzOhIYghBCHMI2iCGMUxwlJSYiShKYURwmJUhiiKI6imFJMUZqSFMKE0ogQDCOKSIxwCAmKIximkJKExmEY4pgmMU4RTiGK4jhsjMMJjqI0ocmcZT1FLzILh08EJQtmYQgQiUlKYIIQDHEapQhFKIJpmsRRCilO3JrAGKVhhBBGGIUojVKSJglGEDlOksIoTQhKE0wwxFGMKMQxiuKEIBiheRs6rc7Z0AXqY101Qu6s1S6dQFkbu9XqWXNjQKVyJmXG8t9Bpdaac1Ap81rloFJ1tRNVAcyW57VklmWSg52QUnNb6wqUvFT6NcC0UhUPcNpMWfBcMs1BkNz7kHWqCQSC5AEEIVzc5jwl7Rj8NXVbKHwH9pg6e5_2EBpeuD6eknYM_vpu7JlZn3diD4aXro-jpB2Dv74be-bW5y_Ys1F2bFNriOZ5rc3UpDkrmH4ueenk7bSwfYPWDgOKynJdMQnWzNg8By9KFM4id1qwoniqlFrVcbh3Yba1GlSKScEa_249dPJiVeVhZAQP24T7sYKrFyand9D0FmndgsdnxVpwWbizwXm-0Rw4NtwcZUzKt2_RZ3RMe9OtpPITevSZ2xcm6_m9FEf-FIM9XnKZqid331aHeccN9USXKkrGiq4ypbXaXcWPevyh5zRc8tx6uk7kQO-RGVJ1MoxVmncjfktzMXeIqwYahAdqza0P1zaHkod3llLRNSnloNowXAd30PTWVSk1k1Fv2bTPqHhZRo0mviCj3rIf90Nx4rfLMmqi6JUZNc9_RUYtmDHNqHbkfz2j4qsyiobjcO3fQdNbfzmj3gBTz6h4YUYNJ74ko96AQAehOPbbhRk1VvTajJrlvyaj5s2Yyahm5B1klOasmM2qQxaNWzasKLo8Ws6f4bib5DCMTg_jNovQmabNFPO5WPLLJxUrOkDSeh_NeL_lQYtMeIbJJ8GVaHiCfMeYcoB6v3L365Bb7yLW-g2pvy3QekqcjbL2kL4uyoZMb42yMUCcgMExzhoAwW_f2HtngXds8P6NgdcpcT7waPiGwBswvTXwxjhqgpnGcGSAl_6WRvlFscis1SKrLTe-BdM8y0juv1XnCHTTDydG3cTf
Mg2WZsffbPbpzN3zq8GOgEmACXjglglpmiP5_DOv77z0-D_uj_1oj72r9thcSv6jfcmgAKoCexoHZJyMzLS5OA18cgdm_7kkn82UVsHmHysKf9VGOCP-2CldNGZF99psQYDvAQ2ihzFXDnSxX-RA8SyLnyhfZsPhMhtd5CJ4wlWql4aL-cmccYNxU2f98VmxIVwWm3qpxX4kNct6w_NCp77shCIfRs4_81Kb8Vmx0dTfR7F-P9N0SSpalBpPlyMv1Ut2Qtnj-Oys3fDCujTjC-49WDTUtRC90HFsCF3B1gSqZ4NXsNEDVzplGjMcSNnIHM3tiS3hn8qKnION2rlgDfA92bslczuvcaeianw1eO3lXxW4-_K5e3Y_FDu7cTTvxFywTchi6697n4b7ZvWD6N5sZzK2MGDvl3DvT5t9vCCriZE9Ikdh7QtPs9uHaZgamZ5nnyxJDhvJ0VjyzF7RSW5Mazjp_C7UGr6PDl4Y7yqmb1jjgy4eTXYwoVG-tQSRcWYeyA6qeKpwiSrqrYj7NUwow2Vf8SMZ_gikWiCmw4lnSI2dj4c9XF7CiS64t_xndOlceZ0unQpLyeaXjFUFcOllfHq12aWq5u0-JlvMcApDEwzfP4b-nh6I_Xgi9uOJ2MUg-ucN771LKox_cdSwkgPW1LfAKPCqarBTtSz8m765PRL591_Fn7zo3sh1mmzUjr9w3U_rBpx_GL0cdxaiR9dC9NEevq3NxgOG8eZ-HDg47HiHLJHihYFsPy0MThUFM0i2w-onKoM-Ur2kJJjBoBeUBMnsMd9wLYNzOkX8e6UbLgSLFiYWX7_USM_D98Va4wR-R3DqvE5utj9dbWQnqjt0ouJq8MJyuREui8UnKq4GsYzx9lFsdELsqZoLF4MtciwWN4uOp-wakdOspGGdoLtBNXcsO9pAOw6M5pyMk2npVPGpp7r7J0quhm8-0LrxhYA58s8u_KVFW_imom28aV1WtF1aIS5VbnNVXq8T4acYb9WqwagTjbuBbr2Pd8Z7eDcQLg1EZ-Dub6oGG_bC_Wno_6cpLkNQG76upQ_PrPaHYwXsRhiw2yjJj993ZHztDvB21UXzQQgvuXwFO2asEzKa-d-Xpd4lOde65avm3VJ6TdNvnHmzdftd5Zt4vg4_fMpyQBpfDzYsVe3Ryao9XS7aZdHYP1OMz5TMxzob9uqyMxX8cYpkTD1z8B6nwL0ytM80C0CKfimPB6X8UWJbMcdjifNYo5GY9roO6RG_mL4bwtmuwJEK9XsHaKnDQNqeyWxvoOsw4H6Jjk93ENJhB-HC1sGRLO4tsvs1JWvXtOtsjPoy1fNBVKPwn3ppfDhcPbdOwbNsSh8iqvVD85eeoToEx-gY2_apjq-IdwSHhkQ69Cn-CCp-OGtnGyh4QDMWd3Ar6gXlVOBc-2YsctQHSWfaIP3JDyZ0S3ZK5DjRT_V5jNWD2Re3kdMdUP-poyvmlGVSvoK6qnjOjWH6dZbruwm2NwbarBNuiltSpCRlN_wWJZjEOIwRutncRpwVhBcsidI0yUgRE5JBiCjjSRJRlN6IWwxxCCkKISVxhFZrlhBMCIcc53idwyCEvGRCdt9C3ghjan6LIElQciNZxqXxX89iXPEd8KMBdst4o28d04esfjZBCKUw1hzFWGGl_-z2kw8zGgf48Y7pfOP317bLcN99WWsc9pDcHL4tFVVTywtVOfTBK5BvmKhE9Qwm_RPfumu7Ee5I7HftSvXCzeqm1vJ2-Anos7CbOlvlqgzwo-_QNZcPW63-w3Mb4EdvqwnwY-uMl1v83wAAAP__N8KxLg">