<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/109471>109471</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Missed optimization: `lshr(smax(x, 0), y)` can be transformed to `smax(ashr(x, y), 0)`
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
okaneco
</td>
</tr>
</table>
<pre>
While refactoring Rust code, I ended up with the `src` loop body, which compiles to extra max/min instructions compared to `tgt`.
Moving the right shift ahead of the signed max with 0 produces better auto-vectorized assembly.
An arithmetic shift right preserves the sign, so negative inputs stay negative and the saturating pack instructions (`packssdw`/`packuswb` in the `tgt` listing below) can handle the clamp to 0.
https://rust.godbolt.org/z/WW4sGoaf4
```rust
/// Reproducer (slow form): converts each `i32` in `input` to a `u8` in `output`
/// by clamping below at 0, shifting right by `N`, and then capping at 255.
///
/// Elements are paired with `zip`, so only as many elements as the shorter
/// slice holds are processed.
pub fn src(input: &[i32], output: &mut [u8]) {
// Fixed shift amount used by the reproducer.
const N: usize = 2;
for (&i, o) in input.iter().zip(output.iter_mut()) {
// Order matters for this report: `max(0)` runs BEFORE the shift, so the
// shifted operand is already non-negative. The result is then capped at 255
// before the narrowing cast.
*o = (i.max(0) >> N).min(255) as u8;
}
}
/// Reproducer (fast form): converts each `i32` in `input` to a `u8` in `output`
/// by shifting right by `N` first, then clamping the result into `0..=255`.
///
/// Elements are paired with `zip`, so only as many elements as the shorter
/// slice holds are processed.
pub fn tgt(input: &[i32], output: &mut [u8]) {
// Fixed shift amount used by the reproducer.
const N: usize = 2;
for (&i, o) in input.iter().zip(output.iter_mut()) {
// Order matters for this report: the shift runs BEFORE `max(0)`. `>>` on
// `i32` is an arithmetic shift, so a negative input stays negative and is
// then clamped to 0; the result is capped at 255 before the narrowing cast.
*o = (i >> N).max(0).min(255) as u8;
}
}
```
<details><summary>Assembly instructions</summary>
```asm
; src                                         ; tgt
.LBB0_6:                                      .LBB0_6:
movdqu xmm2, xmmword ptr [rdi + 4*r8]         movdqu xmm0, xmmword ptr [rdi + 4*r8]
movdqu xmm3, xmmword ptr [rdi + 4*r8 + 16]    movdqu xmm1, xmmword ptr [rdi + 4*r8 + 16]
movdqa xmm4, xmm2                             psrad xmm0, 2
pcmpgtd xmm4, xmm0                            packssdw xmm0, xmm0
pand xmm4, xmm2                               packuswb xmm0, xmm0
movdqa xmm2, xmm3                             psrad xmm1, 2
pcmpgtd xmm2, xmm0                            packssdw xmm1, xmm1
pand xmm2, xmm3                               packuswb xmm1, xmm1
psrld xmm4, 2                                 movd dword ptr [rdx + r8], xmm0
psrld xmm2, 2                                 movd dword ptr [rdx + r8 + 4], xmm1
movdqa xmm3, xmm1                             add r8, 8
pcmpgtd xmm3, xmm4                            cmp rsi, r8
pand xmm4, xmm3                               jne .LBB0_6
pandn xmm3, xmm1                              cmp rcx, rsi
por xmm3, xmm4                                je .LBB0_8
packuswb xmm3, xmm3
packuswb xmm3, xmm3
movdqa xmm4, xmm1
pcmpgtd xmm4, xmm2
pand xmm2, xmm4
pandn xmm4, xmm1
por xmm4, xmm2
packuswb xmm4, xmm4
packuswb xmm4, xmm4
movd dword ptr [rdx + r8], xmm3
movd dword ptr [rdx + r8 + 4], xmm4
add r8, 8
cmp rsi, r8
jne .LBB0_6
cmp rcx, rsi
je .LBB0_8
; `src` with rustc flags `-Copt-level=3 -Ctarget-cpu=x86-64-v2`
.LBB0_6:
movdqu xmm2, xmmword ptr [rdi + 4*r8]
movdqu xmm3, xmmword ptr [rdi + 4*r8 + 16]
pmaxsd xmm2, xmm0
pmaxsd xmm3, xmm0
psrld xmm2, 2
psrld xmm3, 2
pminud xmm2, xmm1
packusdw xmm2, xmm2
packuswb xmm2, xmm2
pminud xmm3, xmm1
packusdw xmm3, xmm3
packuswb xmm3, xmm3
movd dword ptr [rdx + r8], xmm2
movd dword ptr [rdx + r8 + 4], xmm3
add r8, 8
cmp rsi, r8
jne .LBB0_6
cmp rcx, rsi
je .LBB0_8
```
</details>
Emitted IR - https://alive2.llvm.org/ce/z/fa8cRT
`src` body
```llvm
vector.body:
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %input.0, i64 %index
%1 = getelementptr inbounds i8, ptr %output.0, i64 %index
%2 = getelementptr inbounds i8, ptr %0, i64 16
%wide.load = load <4 x i32>, ptr %0, align 4
%wide.load8 = load <4 x i32>, ptr %2, align 4
%3 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load, <4 x i32> zeroinitializer)
%4 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load8, <4 x i32> zeroinitializer)
%5 = lshr <4 x i32> %3, <i32 2, i32 2, i32 2, i32 2>
%6 = lshr <4 x i32> %4, <i32 2, i32 2, i32 2, i32 2>
%7 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %5, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
%8 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %6, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
%9 = trunc nuw <4 x i32> %7 to <4 x i8>
%10 = trunc nuw <4 x i32> %8 to <4 x i8>
%11 = getelementptr inbounds i8, ptr %1, i64 4
store <4 x i8> %9, ptr %1, align 1
store <4 x i8> %10, ptr %11, align 1
%index.next = add nuw i64 %index, 8
%12 = icmp eq i64 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body
```
`tgt` body
```llvm
vector.body:
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %input.0, i64 %index
%1 = getelementptr inbounds i8, ptr %output.0, i64 %index
%2 = getelementptr inbounds i8, ptr %0, i64 16
%wide.load = load <4 x i32>, ptr %0, align 4
%wide.load9 = load <4 x i32>, ptr %2, align 4
%3 = ashr <4 x i32> %wide.load, <i32 2, i32 2, i32 2, i32 2>
%4 = ashr <4 x i32> %wide.load9, <i32 2, i32 2, i32 2, i32 2>
%5 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %3, <4 x i32> zeroinitializer)
%6 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %4, <4 x i32> zeroinitializer)
%7 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %5, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
%8 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %6, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
%9 = trunc nuw <4 x i32> %7 to <4 x i8>
%10 = trunc nuw <4 x i32> %8 to <4 x i8>
%11 = getelementptr inbounds i8, ptr %1, i64 4
store <4 x i8> %9, ptr %1, align 1
store <4 x i8> %10, ptr %11, align 1
%index.next = add nuw i64 %index, 8
%12 = icmp eq i64 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body
```
---
alive2 proof - https://alive2.llvm.org/ce/z/iUbk-i
```llvm
; alive2 source function: scalar form of the pattern with a variable shift.
; Computes trunc(umin(lshr(smax(%input, 0), %shift), 255)) as an i8.
define i8 @src(i32 %input, i32 %shift) {
; Clamp negative inputs to 0 before shifting.
%1 = tail call i32 @llvm.smax.i32(i32 %input, i32 0)
; Logical shift of the already non-negative value.
%2 = lshr i32 %1, %shift
; Cap at 255 so the `trunc nuw` below drops no set bits.
%3 = tail call i32 @llvm.umin.i32(i32 %2, i32 255)
%4 = trunc nuw i32 %3 to i8
ret i8 %4
}
; alive2 target function: the proposed replacement for @src.
; Computes trunc(umin(smax(ashr(%input, %shift), 0), 255)) as an i8.
define i8 @tgt(i32 %input, i32 %shift) {
; Arithmetic shift first: the sign bit is replicated, so a negative input
; stays negative.
%1 = ashr i32 %input, %shift
; Clamp the shifted value to 0 afterwards.
%2 = tail call i32 @llvm.smax.i32(i32 %1, i32 0)
; Cap at 255 so the `trunc nuw` below drops no set bits.
%3 = tail call i32 @llvm.umin.i32(i32 %2, i32 255)
%4 = trunc nuw i32 %3 to i8
ret i8 %4
}
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWc1u4zgSfhr6UrAgkZIsH3xInGQxwO4cGrOY44AWaYsdSdSQlOPO0y9I_cs_SbqDxizQRqMji1VfVX2sYpEm1VocSs43KLpH0cOC1iaTaiOfaclTudhJ9m3zZyZyDorvaWqkEuUBvtTaQCoZR3gLvwEvGWdQV_AiTAYm44BiX6sUxT7kUlZgYeAlE2kGlZKsTrkGfjKKQkFPCD8VogRRaqPq1AhZakhlUVHFGRhpsczBoNj3kP-A_LsvXCrGnR_WlBKHzIDOxN5Y6R2HHd9Lxd2gC45ZK41v_mB_x43hCmht5PLIXWSvnAHVmhe7_FtrrPn_j4wDVcJkBTcibY01hivFNVdHrnt7lhMtm6_U1Ioa56qqy5Ta6GaR0hIyWrKcQ5rTonKyErpgM2Mqjcgdwk8IP6laG-8g2U7mxpPqgPDTK8JPf_4Z6n9Jug_HPqPYb_5ZpeZVVe9gX4KdGZyIsqoNIneAcIyie0Ewih6s77I2w0hRG0DRfZ24wTWg1X2DBQCQylIb-N2K1lq8ckDkATAincheKkA4QTgWDtgCuJmuauMJw5UbXHuvokI4aey6938VtWnGzmzaD8J30tmyYXguhRLfSZJHRB7hdwtaiBLhBEeRHaAa6mRwzGKsHlqe-ocxRzbjfgpHAD-Hpik3PWU3abrAUZdUk0wjW8YNFbl2Nra6LgqqviHyeNcW0yTlEdki_DQIXUxaqosO_N4mLNi_dlLcS-_f9_f-X7EldXhshgp5ZH_XAKeiwJbNU1G8SMWgMspOkmICEL6HEOE7ZSesIwtGiv57FM_MkTe13GMQW6sjveADeiOj1CmHrTKGSivKYPAft_mcFtXBsLGsDxVNn7VmL13so5jbma1oydqRkQmaPtf6ZXddbeRZxz4ZexZc9gz3nt369F6PSAvO3B3svoXlQrmApVXORqHjm0Bt5gCbTt_JzVmTKFNiB3TcoDt1gGsIbR70OMEZ013iBUCZQ7J28RaSM547yRDSomoktVtzVHJ12m_z-LXkfQn2CCXM_Lr16T1JT84TLVogqQCmQOFtVzpP-ljOsrUDIu-VaFke0RFcLSt8NRXDM2rmaEOsZ2hnLoZz1CGTZ0NvZNaQU-Rd8vNMbK1cTLor-WXTxX4mKWNl57P_tZGbzujQEYb9pdvW2S1OCvucHrQdWm5lZZY5P_IckQcCy62h6sDNMq1qRB5OSbyMw-UR933sR5vID_WCdhoLetJsYtI_GyFvrCVnr8n4dSHKerLcBuMMmnQD_GYWziUa8EnZX0W_VoaX6-9dGYy_K4PJT8_g2f7JbYVGm6dRmj8WwhjO4LcvsITpMYDm4sixl-fHoj0FpLw9Cuxpkn75Y7alakvFnsJmbliI5lVzBPKcTFcGdgsZiZLxk9tGVpkAEYeWUnB9H-GoVasyaFm1g52WV_KTmQq6k2Cf9M6A78AP3PCcF7w0dtZEuZN1yTTYLTfeNjNpUe3W19l2jrR2RmDBTbBkhNXuo2-B4XeD9ShBPNJ_EYx7uaTM4bQP2xBOLizyOAeguTiUEF5CSN6GwJchiNO0GQYpzfOpOqDQ5YCnC3ryjqGjO5mJjNxwkzkZfeVKilIYQXPxao8q65Ht8DNtJx80HjWU6UzN7eKItFiCYHC8XXvoatIhxjcQw-9CXL2LoLoQ5XWConNeOj-asduPNofGtCWf4FL8uS6tG5dUXaZQ1i_n5K_c70Tt22RKceC_pZ3c0n7_ghJ0a0BXfdpIxSfALpi5SlOyQWf0slbgj9Uu6U0WXee0bWs24MnqNjS4Jr5mjRO2g_G_J6Lj1bv0jjzt1HYKRNAo2-Gc7nhuvxaCsZx7u1ymz5OBcW-58TNC9zvfr0b1_9WoxgDrH-xT9OLiOu8_H1tkw_cgr78LOvqEBkc-2NfiT7AZftDmrz71D-5TP9im
fvWpj_Wp5XI5_tocxqBSUu4_clIT_909L8XVNsf4XpQcRGJTub2xIbjvLV0aIhy526jZBUDfYIb6cNLjxaGpiUug_jSt8bDtbaWDlu7G9PWzxtimq8SJTTwpqwvnhr4UWgViC0D0aaG4cfzY5Wx2TXFOYXuh8x0U0lHgveKF4PGHCA-ukP2PobBLyQXbELYma7rgm2CF4yQOSRQusg1L_CSJ41Uc7nnKw5StmU8Df5WuUsbigC7EBvs49NfYxz7BhHhpGCUswGmwZwSHQYBCnxdU5H19LITWNd8E_jpcBQtXl9rdTGNc8hdwowhjFD0s1MYqLXf1QVt6hDZ6gDHC5HzzH6E1ZyArIwrx6u5e3Y1d7NtcRjjRzT2YW2zcNOAtfLN_Y9_dy-44GEVLvZeq6O-iWyXaQJx6nQ4j9he1yjfTdeAgTFbvvFQWCD-5Im_-LCslv_LU2PXABqcRfmqjP27w_wIAAP__pNyhNg">