<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/103946>103946</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
mratsim
</td>
</tr>
</table>
<pre>
I'm not sure whether to blame `opt` or the code generator:
- either `opt` generates IR that uses a pattern that the code generator doesn't recognize and fuse correctly,
- or the code generator should recognize the pattern.
In any case, the following code has a good version with 12 instructions in the critical path (add/adc, sub/sbb and cmov), while the bad version (`.v2`, which `opt` inlines into `secp256k1_fp_add`) has 20 instructions in the critical path.
Both are from the same IR except that one has a constant inlined by `opt`.
This is similar to https://github.com/llvm/llvm-project/issues/102868 but even worse.
https://alive2.llvm.org/ce/z/KzsXa9
## Original IR
```llvm
define void @_modadd_mayo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%M = load i256, ptr %3, align 4
%ax = zext i256 %a to i320
%bx = zext i256 %b to i320
%a_plus_b = add i320 %ax, %bx
%mx = zext i256 %M to i320
%apb_minus_M = sub i320 %a_plus_b, %mx
%5 = lshr i320 %apb_minus_M, 319
%6 = trunc i320 %5 to i1
%7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
%8 = trunc i320 %7 to i256
store i256 %8, ptr %0, align 4
ret void
}
@secp256k1_fp_mod = constant i256 -4294968273, section "ctt.secp256k1_fp.constants", align 64
; Function Attrs: hot
define internal fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%M = load i256, ptr %3, align 4
%ax = zext i256 %a to i320
%bx = zext i256 %b to i320
%a_plus_b = add i320 %ax, %bx
%mx = zext i256 %M to i320
%apb_minus_M = sub i320 %a_plus_b, %mx
%5 = lshr i320 %apb_minus_M, 319
%6 = trunc i320 %5 to i1
%7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
%8 = trunc i320 %7 to i256
store i256 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
define void @secp256k1_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.secp256k1_fp" {
call fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr @secp256k1_fp_mod)
ret void
}
attributes #2 = { hot }
```
## opt -O3
```llvm
define void @_modadd_mayo.u64x4(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2, ptr nocapture readonly %3) local_unnamed_addr #0 section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%M = load i256, ptr %3, align 4
%ax = zext i256 %a to i320
%bx = zext i256 %b to i320
%a_plus_b = add nuw nsw i320 %bx, %ax
%mx = zext i256 %M to i320
%apb_minus_M = sub nsw i320 %a_plus_b, %mx
%.not1 = icmp slt i320 %apb_minus_M, 0
%5 = select i1 %.not1, i320 %a_plus_b, i320 %apb_minus_M
%6 = trunc i320 %5 to i256
store i256 %6, ptr %0, align 4
ret void
}
define void @secp256k1_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #1 section "ctt.secp256k1_fp" {
%.val = load i256, ptr %1, align 4
%.val1 = load i256, ptr %2, align 4
%ax.i = zext i256 %.val to i320
%bx.i = zext i256 %.val1 to i320
%a_plus_b.i = add nuw nsw i320 %bx.i, %ax.i
%apb_minus_M.i = add nuw nsw i320 %a_plus_b.i, 4294968273
%.not.i = icmp ugt i320 %a_plus_b.i, 115792089237316195423570985008687907853269984665640564039457584007908834671662
%4 = select i1 %.not.i, i320 %apb_minus_M.i, i320 %a_plus_b.i
%5 = trunc i320 %4 to i256
store i256 %5, ptr %0, align 4
ret void
}
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
```
## Codegen
```asm
_modadd_mayo.u64x4: # @_modadd_mayo.u64x4
push r14
push rbx
mov r9, qword ptr [rdx + 24]
mov r8, qword ptr [rdx + 16]
mov rax, qword ptr [rdx]
mov rdx, qword ptr [rdx + 8]
add rax, qword ptr [rsi]
adc rdx, qword ptr [rsi + 8]
adc r8, qword ptr [rsi + 16]
adc r9, qword ptr [rsi + 24]
setb sil
movzx esi, sil
mov r10, rax
sub r10, qword ptr [rcx]
mov r11, rdx
sbb r11, qword ptr [rcx + 8]
mov rbx, r8
sbb rbx, qword ptr [rcx + 16]
mov r14, r9
sbb r14, qword ptr [rcx + 24]
sbb rsi, 0
sar rsi, 63
cmovs r11, rdx
cmovs r14, r9
cmovs rbx, r8
cmovs r10, rax
mov qword ptr [rdi + 16], rbx
mov qword ptr [rdi + 24], r14
mov qword ptr [rdi], r10
mov qword ptr [rdi + 8], r11
pop rbx
pop r14
ret
secp256k1_fp_add: # @secp256k1_fp_add
push r15
push r14
push r12
push rbx
mov r8, qword ptr [rdx + 24]
mov rcx, qword ptr [rdx + 16]
mov rax, qword ptr [rdx]
mov rdx, qword ptr [rdx + 8]
add rax, qword ptr [rsi]
adc rdx, qword ptr [rsi + 8]
adc rcx, qword ptr [rsi + 16]
adc r8, qword ptr [rsi + 24]
setb sil
movzx r11d, sil
movabs rsi, 4294968273
add rsi, rax
mov r9, rdx
adc r9, 0
mov r10, rcx
adc r10, 0
mov rbx, r8
adc rbx, 0
xor r14d, r14d
movabs r15, -4294968274
cmp r15, rax
mov r15, -1
mov r12, -1
sbb r12, rdx
mov r12, -1
sbb r12, rcx
sbb r15, r8
mov r15d, 0
sbb r15, r11
mov r11d, 0
sbb r11, r11
mov r11d, 0
sbb r11, r11
sbb r14, r14
cmovae r9, rdx
cmovae rbx, r8
cmovae r10, rcx
cmovae rsi, rax
mov qword ptr [rdi + 16], r10
mov qword ptr [rdi + 24], rbx
mov qword ptr [rdi], rsi
mov qword ptr [rdi + 8], r9
pop rbx
pop r12
pop r14
pop r15
ret
```
## Analysis
In the "good" version, the extra SAR when using i320 is unnecessary and has already been reported separately in https://github.com/llvm/llvm-project/issues/103841.
In the bad version, the following is equivalent to subtracting M (since M = 2^256 - 4294968273, adding 4294968273 modulo 2^256 is the same as subtracting M):
```asm
movabs rsi, 4294968273
add rsi, rax
mov r9, rdx
adc r9, 0
mov r10, rcx
adc r10, 0
mov rbx, r8
adc rbx, 0
```
The optimizer misses that there can't be carries here, but that's not the main issue.
Then there are these problematic 12 extra instructions as a follow-up:
```asm
movabs r15, -4294968274
cmp r15, rax
mov r15, -1
mov r12, -1
sbb r12, rdx
mov r12, -1
sbb r12, rcx
sbb r15, r8
mov r15d, 0
sbb r15, r11
mov r11d, 0
sbb r11, r11
mov r11d, 0
sbb r11, r11
sbb r14, r14
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWktv4zgS_jXMpdCGRL0POXQmMBAsBgP0zmFvASXRNnclUkNSdpxfvyAl62XR7XS6L7sTzLQN1Yusqu8ryhJRiu05pY8oekLR8wNp9UHIx1oSrVj9kIvy_PiCcFIDFxpUKymcDgK0gLwiNYWc6hOlHFDsiUaj2AMhQR8oFKKksKecSqKFRMFX5D0j7-sXoEwfqATR6IuYKnj5BvpANLSKKiDQEK2p5N21i7fBGZSCKo5wokHSQuw5e6dAeAm7VhlVKWmhq_Ml4GRBowt1EG1VTuyNSh-2M-z-feFA-BkKojqVnagqcWJ8323wQMxy90KUcKRSMcHhxPQBfAyMKy3bQjPBFTDerUEyzQpSmUgHQDglZYnwlpQFwr-BanOEtyrP7WaKWhwRzux3Y5uTMcbmiG1o7H0_zGa6nSehD0AkhZ0UtdVVpoov34C-FbTRXcYFv-ysEFxpwjUwXjFOS8jPY6lnnv88MAVMgWI1q4g0HXLQulGm8niL8HbP9KHNN4WoEd5W1fHy8aWR4t-00AhvmVItVQhvfQ-ncQp5q4EeKYeTkIrOws19k4odKd4Ydxsh9whvC4rw9h3h7T_e1b9INjVFOEA4gD8k2zNOKnj5NpPGXvefXaK9VNId4xSOgpWAQu-1FiUpy9eanMWmjcO3EOG00RIQjjxTx_67P_mOJ98DU1VFbckAYVxovdkxWpUKYQwoeeqigtElgIJnqAQpgeEoXnonFdtzCCcGudMArxv87jQI1g3Im7V4p2_aWnTL1AJYgL3pSlb08hU98tpUrXrtFk7K0sq7QGYB1tNEvV5x-_ua2yZ_rRlv1Wu3Q9Xmo-c-ZO-_nvqPunyogxzVR1fGIvCziX5s9bVseTEYRHY9_kQr6dZAK1poYL41NL7WVrQWdvSUrsRLbDzjstdTWkg6pCedFNVbKaqk2jZ3D4DkeYaH0FO0aHAU_8d_3TWm-e0KRmYwUb6EOAuzOMWJ7ZpFd08dbC6GptvHxcThLGjwBNuWd06-ai0N1OEg9AyQjBu2JhXsiNJFcQOgmyP-AYwiHOC_gWr1fg1O_4bpz4PpXYi5IGSGaHsGuQMc64CYulrAoiDVz8fmNR0hnN2VIqK1ZHlrjpt2I6ZAKHkySYJR9TL-V04M5sj65Y_g0-cELgrSaHuUlkxTwavzbO-jXFJSXsT-bTG-LbaEVomCVK8t56Smpam7yW3g3cNx__sUt8pxvD0BV6cBwvmF68jPOpNMvd8ivA0X2rdWrKgbUJV2EZ93RZMzQrOOfozU3CTqJLX4M6R2H2v9Kji58OJ_gAJNvo-k-ihyjI3_UfSQtw277kMbfg0RLmX_Bi56m3VkbNiAjQ1b7_ob9mMI42VynJxDoPdgMdDutcPc96Mkw16a4SAJ_NjPohAHUeJlaeR5aZwmmZekUYDjLEvDOI7i0DP_B1kYJVEael6SeWkahHHixzEeKTBch1MXdAU7C8G4xiVC55gKb2Mq-gym5lPQG6Zg3SrdSLGXVCngYicpBS4kLVqpzDd15gVw0fIT4yWcWFVJqlvJoaa1kGeEUyL3Na3NscMAyWLRHhou8eeR_dn8_fXRbw3237qfhlZ_BiCqn-4r4zwwTRE4Zn1fjO6vadXBfEp_TSDHk3P3V4uj_ZSZKe9fJyHLruDRkyzfAOEnwCGKnh1WqdPKj91W3Tl-abaqb667QqRXFgbuzgiKjfqkLDrFNfeKOd0Xzm33RivbHqxWUtxbraRYUZ3bT1b1gloc398AgCqL9VGwTK9voSrJotTmDDCRzxdSrKe_07ezw6Rq7i_Pp_KlP0cOB6_dAUemDqf5SmGK77eWH1qvmWupodPrWg0uZl3GvYWUyKk0DubiohZH5UxeJ3UueBAvsjSardf4kocFXqa9aczWWGDVpEuKMVnSiSvSoO-NbeteUjqo
-wu2Es3QB1eCq8VI2t99Xh3cgkFn8dfT6ZWBg0yj77HseAU7VJ3U6ybRW9RbuInxJ3EvuAjyJv9-l3sH3Q9R8KC-tu87yNdN2XeR75gWy8HS90sXCZNcXTjh-nA55glG6nDCuBsZI3nMR4l3m_-LN0cqOvkSnw5CHqw68SLmm-g50A_LnibK1YRYCOHfJj_fhkvG6wHf6Y0pGbfVOfBdu8Zr4pH58TyVH7RbJnOUR2tZmyy6XBsdC-sl-9mTT99iTlPfaQqTDl0xd5kupuQVy5rhQ-h1Vw7X1ztoEDv6cpAvwXDnMPMdOPjOQHPRsWugKfaBMMNcyz4w1mB1fKzNu1E7cgzCWzchXzmpzoqpqeile56LMN4LUSKML0-AzS6MhL5pSeCfX7_B6UA5tIrxfXcbyRS0nNOCKkXk2T5FPhAFOaUcSGXukc4gaSOkpiUo2hBJNK3OwPinntsGaeiv7GDy8Pqy9PFJOlNA_2rZkVSUa3Prq9pcaUnsjyx7yM_wu-umbFL2_weCX_TPnwcKotGsZu9UQs2Uomp4Z0JSKEj3lkRuvkrJqAJz3TjM2-5ZP8KJsq92mJLUhHGwxVw82qe898iMf6agkSKvaE00K_oeXL7xYF8f6GrcNneW765xdDWKrurwoxPpahrdbwprQ-kXTaM7Zgl8bhTdMneNo0VrPpSPQZkFGXmgj36CgzjEXhI-HB6DLPcxDXI_2pV-XoQlDpIwTSjFu7RMi_iBPWIPh17qh36Akyjc0MQPQpr7RZ7s4mjnodCjNWHV8LrHg23ZR98LsjB-qEhOK2XfZ8KY01PX0AhjFD0_yEdLXXm7Vyj0Kqa0Gt1opiv7IpRoNCC8BaZohaJniGOEI6iFpDNOnfX75V2mEsqW2vejSDm8xyR6dJYmfYYwi7pBeGtmLOxaw4oPraweP0W8Zu_HR_zfAAAA__-Ed1uE">