<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/102868>102868</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[x86 / Clang] Pessimization: missed fusion of subtraction/compare/cmov after -O1 optimization or extra caller.
</td>
</tr>
<tr>
<th>Labels</th>
<td>
clang
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
mratsim
</td>
</tr>
</table>
<pre>
This is a follow-up to my quest to do efficient cross-ISA modular arithmetic for cryptography and to find a workaround for #102062.
Ignoring the loads/stores, the following LLVM IR is optimized into an optimal sequence of adc,sbb,cmov with
- either -O0
- or a single caller
when compiled with Clang
https://alive2.llvm.org/ce/z/hnQycG
but it becomes adc, sbb, cmov, adc, i.e. 33% more compute instructions, with the following IR.
![image](https://github.com/user-attachments/assets/d6f88085-5422-470e-a761-3951658521da)
## LLVM IR
```llvm
; ModuleID = 'x86_poc'
source_filename = "x86_poc"
target triple = "x86_64-pc-linux-gnu"
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bn254_snarks_fr_mod = constant i256 21888242871839275222246405745257275088548364400416034343698204186575808495617, section "ctt.bn254_snarks_fr.constants", align 64
define void @bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) section "bn254_snarks_fp" {
call fastcc void @_modadd_noo_u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fp_mod)
ret void
}
define internal fastcc void @_modadd_noo_u64x4(ptr %0, ptr %1, ptr %2, ptr %3) section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%a_plus_b = add i256 %a, %b
%5 = alloca [4 x i64], align 8
store i256 %a_plus_b, ptr %5, align 4
call fastcc void @_finalsub_noo_u64x4(ptr %0, ptr %5, ptr %3)
ret void
}
define internal fastcc void @_finalsub_noo_u64x4(ptr %0, ptr %1, ptr %2) section "ctt.fields" {
%M = load i256, ptr %2, align 4
%a = load i256, ptr %1, align 4
%a_minus_M = sub i256 %a, %M
%borrow = icmp ult i256 %a, %M
%4 = select i1 %borrow, i256 %a, i256 %a_minus_M
store i256 %4, ptr %0, align 4
ret void
}
; Comment this out for good codegen (or use Clang -O0)
define void @bn254_snarks_fr_add(ptr %0, ptr %1, ptr %2) section "bn254_snarks_fr" {
call fastcc void @_modadd_noo_u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fr_mod)
ret void
}
```
## Clang -O1
```asm
bn254_snarks_fp_add: # @bn254_snarks_fp_add
push rbx
mov rax, qword ptr [rdx]
add rax, qword ptr [rsi]
mov rcx, qword ptr [rdx + 8]
adc rcx, qword ptr [rsi + 8]
mov r8, qword ptr [rdx + 16]
adc r8, qword ptr [rsi + 16]
mov rdx, qword ptr [rdx + 24]
adc rdx, qword ptr [rsi + 24]
xor esi, esi
movabs r9, 4332616871279656263
cmp rax, r9
movabs r9, -7529619929231668595
mov r10, rcx
sbb r10, r9
movabs r9, -5165552122434856867
mov r10, r8
sbb r10, r9
movabs r9, 3486998266802970665
mov r10, rdx
sbb r10, r9
movabs r9, -3486998266802970666
cmovb r9, rsi
movabs r10, 5165552122434856866
cmovb r10, rsi
movabs r11, 7529619929231668594
cmovb r11, rsi
movabs rbx, -4332616871279656263
cmovb rbx, rsi
add rbx, rax # This sequence shouldn't exist
adc r11, rcx
adc r10, r8
adc r9, rdx
mov qword ptr [rdi], rbx
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi + 16], r10
mov qword ptr [rdi + 24], r9
pop rbx
ret
bn254_snarks_fr_add: # @bn254_snarks_fr_add
push rbx
mov rax, qword ptr [rdx]
add rax, qword ptr [rsi]
mov rcx, qword ptr [rdx + 8]
adc rcx, qword ptr [rsi + 8]
mov r8, qword ptr [rdx + 16]
adc r8, qword ptr [rsi + 16]
mov rdx, qword ptr [rdx + 24]
adc rdx, qword ptr [rsi + 24]
xor esi, esi
movabs r9, 4891460686036598785
cmp rax, r9
movabs r9, 2896914383306846353
mov r10, rcx
sbb r10, r9
movabs r9, -5165552122434856867
mov r10, r8
sbb r10, r9
movabs r9, 3486998266802970665
mov r10, rdx
sbb r10, r9
movabs r9, -3486998266802970666
cmovb r9, rsi
movabs r10, 5165552122434856866
cmovb r10, rsi
movabs r11, -2896914383306846354
cmovb r11, rsi
movabs rbx, -4891460686036598785
cmovb rbx, rsi
add rbx, rax
adc r11, rcx
adc r10, r8
adc r9, rdx
mov qword ptr [rdi], rbx
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi + 16], r10
mov qword ptr [rdi + 24], r9
pop rbx
ret
bn254_snarks_fp_mod:
.quad 4332616871279656263
.quad -7529619929231668595
.quad -5165552122434856867
.quad 3486998266802970665
bn254_snarks_fr_mod:
.quad 4891460686036598785
.quad 2896914383306846353
.quad -5165552122434856867
.quad 3486998266802970665
```
## Clang -O0
```asm
bn254_snarks_fp_add: # @bn254_snarks_fp_add
push rax
mov rcx, qword ptr [rip + bn254_snarks_fp_mod@GOTPCREL]
call _modadd_noo_u64x4
pop rax
ret
_modadd_noo_u64x4: # @_modadd_noo_u64x4
push rbx
sub rsp, 48
mov qword ptr [rsp + 8], rcx # 8-byte Spill
mov r11, rdx
mov rdx, qword ptr [rsp + 8] # 8-byte Reload
mov rax, qword ptr [rsi + 24]
mov rcx, qword ptr [rsi + 16]
mov r8, qword ptr [rsi]
mov rsi, qword ptr [rsi + 8]
mov r9, qword ptr [r11 + 24]
mov r10, qword ptr [r11 + 16]
mov rbx, qword ptr [r11]
mov r11, qword ptr [r11 + 8]
add r8, rbx
adc rsi, r11
adc rcx, r10
adc rax, r9
mov qword ptr [rsp + 16], r8
mov qword ptr [rsp + 24], rsi
mov qword ptr [rsp + 32], rcx
mov qword ptr [rsp + 40], rax
lea rsi, [rsp + 16]
call _finalsub_noo_u64x4
add rsp, 48
pop rbx
ret
_finalsub_noo_u64x4: # @_finalsub_noo_u64x4
push rbx
mov rax, rsi
mov rbx, qword ptr [rdx + 24]
mov rsi, qword ptr [rdx + 16]
mov rcx, qword ptr [rdx]
mov rdx, qword ptr [rdx + 8]
mov r8, qword ptr [rax + 24]
mov r9, qword ptr [rax + 16]
mov r11, qword ptr [rax]
mov r10, qword ptr [rax + 8]
mov rax, r11
sub rax, rcx
mov rcx, r10
sbb rcx, rdx
mov rdx, r9
sbb rdx, rsi
mov rsi, r8
sbb rsi, rbx
cmovb rax, r11
cmovb rcx, r10
cmovb rdx, r9
cmovb rsi, r8
mov qword ptr [rdi + 24], rsi
mov qword ptr [rdi + 16], rdx
mov qword ptr [rdi + 8], rcx
mov qword ptr [rdi], rax
pop rbx
ret
bn254_snarks_fr_add: # @bn254_snarks_fr_add
push rax
mov rcx, qword ptr [rip + bn254_snarks_fr_mod@GOTPCREL]
call _modadd_noo_u64x4
pop rax
ret
bn254_snarks_fp_mod:
.quad 4332616871279656263
.quad -7529619929231668595
.quad -5165552122434856867
.quad 3486998266802970665
bn254_snarks_fr_mod:
.quad 4891460686036598785
.quad 2896914383306846353
.quad -5165552122434856867
.quad 3486998266802970665
```
## Clang -O1 but with only a single proc
```asm
bn254_snarks_fp_add: # @bn254_snarks_fp_add
push r14
push rbx
mov rcx, qword ptr [rdx + 24]
mov rax, qword ptr [rdx]
add rax, qword ptr [rsi]
mov r8, qword ptr [rdx + 8]
adc r8, qword ptr [rsi + 8]
mov rdx, qword ptr [rdx + 16]
adc rdx, qword ptr [rsi + 16]
adc rcx, qword ptr [rsi + 24]
movabs rsi, -4332616871279656263
add rsi, rax
movabs r9, 7529619929231668594
adc r9, r8
movabs r10, 5165552122434856866
adc r10, rdx
movabs r11, -3486998266802970666
adc r11, rcx
movabs rbx, 4332616871279656263
cmp rax, rbx
movabs rbx, -7529619929231668595
mov r14, r8
sbb r14, rbx
movabs rbx, -5165552122434856867
mov r14, rdx
sbb r14, rbx
movabs rbx, 3486998266802970665
mov r14, rcx
sbb r14, rbx
cmovb r9, r8
cmovb r11, rcx
cmovb r10, rdx
cmovb rsi, rax
mov qword ptr [rdi + 16], r10
mov qword ptr [rdi + 24], r11
mov qword ptr [rdi], rsi
mov qword ptr [rdi + 8], r9
pop rbx
pop r14
ret
bn254_snarks_fp_mod:
.quad 4332616871279656263
.quad -7529619929231668595
.quad -5165552122434856867
.quad 3486998266802970665
```
## Extra context
Modular addition is critical to optimize for cryptography and is used everywhere (HTTPS / authentication to websites). Currently, state-of-the-art libraries have to use assembly for both speed and correctness reasons (constant-time), and improved compiler support is important for more robust software and also for use on a wider range of hardware (WASM, GPUs, FPGAs, ...).
## Reproduction
I used to be able to reproduce it with `llc` and `opt`: `opt` moved the select one instruction apart from the borrow and reordered instructions so that either 0 or the modulus was added. Unfortunately, I lost the exact fiddling that produced that output.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsO1tv2ziXv0Z5ObAhUSJNPeShSSbdAFNMt-3sPgaUSNvckUWVpBJnfv2ClHyTKV-azuD7PoxRxKp5buQ5PDcfM2PkohbiNsJ3EX64Ya1dKn270swaubopFH-7_baUBqQBBnNVVep10jZgFaze4HsrjHXPXIGYz2UpRW2h1MqYydPXD7BSvK2YBqalXa6ElSXMlYZSvzVWLTRrlm_Aag5zWXNZL4DBq9J_MK3amjuyEUqTGMUERfFDFH_o_j4taqUdtF0KqBTjJkKPxiotTITu_aednA7m11__5xM8fXHiq8bKlfxTcJC1VcDq7hNWgRHfW1GXAtQcGC8jdG-KIkL35Uq9wKu0y47xBIS0S6Fh8lu8-URpYGBkvagElKyqhN4X9XUpaijVqpGV4J4S3FesXuzDLK1tTJR-iNBjhB5ZJV8EmlbVy2qq9CJCj6WI0OOfEXpc1v_9Vn7cRy1aC9JCIUq1EuZI9AjdM96dOIM0jRCGldLCC9RaAbI2Vrellao2nXCHZ_f0ZbrPLUJJhO_kii1EhB8iRA8lX0i7bItpqVYRemyN0BNmLSuXK1FbpyFmjPAPnMwpjSme4AyhSTaLxYTNSDJJc5wQTDFKOItQfsC6FyCNULpR6cEKibt_7tz6j9I7-OTMTzw9QJQ-QIRma0qeG1VGaNbBGNXqUjzPZSVqthI9GNqC9WZnmV4IC1bLpjoAItmkKSeVrNv1ZFG3W4T-bxYXNcLZs6mZ_sM8z5vnleIev1S1say2IBEmgBJKKcoQnSU0zdEMI4RQRrIYzzKM8AzNcEwpyUmaJAmeoXxGUUoIIjSP0xnNM5JhhAiKKaapuwJGeKU6MUtrpwMpphvuxgmM7oFVclEDyUbE1u8SG2c0JVkWx1lC4jRLs5TkFMVZQgmeYRrTLMckmZ0VW58T2__lYi5rAS9KcgicP-M8QrSxGiKEY0ekf072nlGE8n1ZBkQihCCa3XX8wF96mDNjy3LL1h0Y4_y5Vuq5Jdk6u4hp_xy0mu19ANDCekb9tmcPgf3L2gpds58rGMLp4GScluZSVM4DHxxKhDDzFuPcs7eWIfVOedkeQjGKgMII7LmpWvPc4THeofkFB-8o7gHjDqqqVMkgwncZrEGSzDuyDXHaw_tYsqPWs9kTCAcECtrBXNasMm1x7sDx4JB_iq4vZX7K9k9p-NPVCrvWJNjzStatee44mbY4UvGnfQNSWqtXDyrLVQNtZU_CZx1VUYnSgkx2JBzsAeLOFHp5NlQOLSXb21Ec2NFJbbp4da9WLlqCdQmXaq0P3QulOJSKi4VwKqFKQ2tEl0f4TGRjLid9n_4Zvk__3b5PX-j7NuE_kC1szikJ5gvM9OlCKFKkH1wCOhZHepG6V9OapXvXxfpwwWWQmq3dDr-_Ks27feI7zdfO9xzAOh_miYTgjdzBO6IesAwShgjdAQ2QL8exjBzB2vKio6wSMs4rgNWzCmBtefHxfaEsyCyI0nMKoKyV9u_CSIfm3oaSsMIA6NwtZ2mKSELoLEGznGCCSHoI7tzNnuZ0foraZIZRTpI8RzlKE0IozvHIOST-yjiFHaybojhYP83OZdUYowShLM0oJpTMTrPbxsFr2KQZJXlOESE0RvksJgQPrbUnw9-1m2M-ZKgL9eIIdvB6VLMdt-PDGSPXSzdOz7u0Y91mY_SS0_QKb0qTC2yvJ9ghHBHcepV-na29V_NF_bbwNUvVVryO0MyCWEtjR65zL_TQIP0NHFjPEDUPKn9jHIO7LvvELOhPQ_A77-WwkuQKrM4RebT4CjS0SR6HBtuo5lhuLWwo0OhNoAm8grFHXxl7YCyevCf-DOn_TWHoLw1B74g9o2g_Lf7QPMlITCiJU4JzOqP4HfEH0ZzkSZbSNI0JzUiK03_Cz39I-JkcK_e98ee87V0bf66PLSHz
2Fv-d40t18cVOPDzI0Gl6xilHw7Rp99b5jQxnlBsIM4nqFvI8cu9ARm9mOFwOC75WUPccDzr3zrA847p_BYGdedhxRn_UMUZeF1dhA6v2MlALRtvdiETyuKPv337fP_ll1-PApgv_QHguNzvwLbGOpRla7PHmLt6e4zqcKPDG2za3pebpoubgxgwCM_N4c0u1_snTifFmxXwtZFVFchFem815nPC-cCOYYDTF1Epxq9I4cbTi5MKv6D-DqZM4ZysS2CubSbkx0hJcrSbQegOwp_aRhHYfZKMw3c6DbIJ5bF877COTHEbtroDOgobgzT4KD74uDaazo1b8y7i0CuwdkEnkA-Mo6Vod3cuuWpZvIEfuoVKsP3jOtpQB7z1OoHedlg5A0fQv06XaCHqJz3zWXEuLdF2p3_ShsfLkS1a6FqO10pnKrkfatRd10pkJ7cU8hfs_G5C95mtL_Qv7Ow2ep0Nb7YLQpu1YTI7OOijW78tRvp1PrxW_YkPXcIWj4eT8IFdHNdY_edD-9wm9yN73a6P7Ge7HhZ7uzwQa0_qMznyZe7qKBs_OtfTOf-YGscqi6F3O07d-9e5ttDVPaD3p376otTvXIZ2lP79U6v8a9UqoYolgaK13SSQqqu33WRTo1X5Q9XM1aVLsi0gzobME13HsTDyFzZCTzQnT_RAx3uT9LrW5KkG6InW5Cm0E7VD-Hy7vlHnys9_b7FL0WTQZx423s5-rzL4nuE4klzVdhs2nQJl3kHb7Wxb8FyXa9B2u_obx8AtOWjjXfGdYxY6wF2XNLuI3xVd3-xMW_YihuP93xF-403tML9BX5eOrI6ot1sfs6ZhGjQWxP-ar68ubWj-UMpFr_mSbJs1JNlFadIgeTifNfSvC5KHC7KGA2I_FHd_WVvNoFS1FWu7v_5pMzfNufSjONJAqaWVJavAqu0oc3iaWhpojeAgXoR-e10KLSBC9L--ffv8FSL0CKy1S1E7Yp62VfAqCiOtMBHKp3Dfai1qW72BscyKiZpP7FJMmLZQyUIzLYWBJXsRDrM1ApgxYlVUb16aQtklmEYI7mUpldaitLUwBrRgRtXGCbOZ5ZxYuRIRyju5V41WL4JvJqY1mLZplLZuR3LlnljdDUX5OWatitZYMGpuX5kWngarjBOKLQSoGl4lFxqWTHMPECH6vx--fnL2-PHz735a_PHzxw_-YTqdDoeOOyV9EY1WvBuT3l9-6g7ZKigEsKLyx6F7YAGyT6T8UHIZkdiLF5FYNdZZA7rf_cddIUdqKTbjaKo-GM8G1rjjn2u18lD9qJujqIXSXGg_1b43zm0U2CWzm4n1GJSGpegG8lsDr8w44xJ8Cr_Xc6VtWzMrqjd4gkoZ65mINSstzCXnVTdtzyz0u-Pd_1Rrm9ZOb_htyvM0ZzfiNpmhNMMYU3yzvJ2XPE4TUfJZmVHB2SxJRZ5kLOcZL2ke38hbFKMspgmK8zTP8FSIOKZFIXCcxkmBkyiLxYrJajsQfyONacVtEiNK6E3FClEZ_9sFhEo_WY9QhB9u9K1DmBTtwkRZXEljzY6ElbbyP3hYU-IvRDeTjx_gszDGXSx_MVz-upLGKXneGqcFNXfFvbGaddaAHp2lMi3ck3ODbG79zwOSzQ3tbpjSILqr7n8hML1pdXV7YnreT7F3b5NGq_8TpY3Qo9-5idBjv_mXW_T_AQAA__97sVON">