<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/104875">104875</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Missed optimization with autovectorized saturating truncation (clamp)
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
okaneco
</td>
</tr>
</table>
<pre>
In Rust, trying to clamp and truncate from a slice of `i32` to a slice of `u8` using the standard library's clamp function produces more instructions than manually clamping with `max` and `min`.
```rust
pub fn clamp(input: &[i32], output: &mut [u8]) {
for (&i, o) in input.iter().zip(output.iter_mut()) {
*o = i.clamp(0, 255) as u8;
}
}
pub fn manual_clamp(input: &[i32], output: &mut [u8]) {
for (&i, o) in input.iter().zip(output.iter_mut()) {
*o = i.max(0).min(255) as u8;
}
}
```
https://rust.godbolt.org/z/zf73jsqjq
<details><summary>Assembly instructions</summary>
Clamp
```asm
.LBB0_4:
movdqu xmm6, xmmword ptr [rdi + 4*r8]
movdqu xmm5, xmmword ptr [rdi + 4*r8 + 16]
pxor xmm3, xmm3
pcmpgtd xmm3, xmm6
packssdw xmm3, xmm3
packsswb xmm3, xmm3
pxor xmm4, xmm4
pcmpgtd xmm4, xmm5
packssdw xmm4, xmm4
packsswb xmm4, xmm4
movdqa xmm7, xmm6
pxor xmm7, xmm0
movdqa xmm8, xmm1
pcmpgtd xmm8, xmm7
pand xmm6, xmm8
pandn xmm8, xmm2
por xmm8, xmm6
packuswb xmm8, xmm8
packuswb xmm8, xmm8
pandn xmm3, xmm8
movdqa xmm6, xmm5
pxor xmm6, xmm0
movdqa xmm7, xmm1
pcmpgtd xmm7, xmm6
pand xmm5, xmm7
pandn xmm7, xmm2
por xmm7, xmm5
packuswb xmm7, xmm7
packuswb xmm7, xmm7
pandn xmm4, xmm7
movd dword ptr [rdx + r8], xmm3
movd dword ptr [rdx + r8 + 4], xmm4
add r8, 8
cmp rsi, r8
jne .LBB0_4
```
Manual clamp
```asm
.LBB0_4:
movdqu xmm0, xmmword ptr [rdi + 4*r8]
packssdw xmm0, xmm0
packuswb xmm0, xmm0
movdqu xmm1, xmmword ptr [rdi + 4*r8 + 16]
packssdw xmm1, xmm1
packuswb xmm1, xmm1
movd dword ptr [rdx + r8], xmm0
movd dword ptr [rdx + r8 + 4], xmm1
add r8, 8
cmp rsi, r8
jne .LBB0_4
```
</details>
Emitted IR - https://alive2.llvm.org/ce/z/hbU88w
Clamp
```llvm
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %input.0, i64 %index
%1 = getelementptr inbounds i8, ptr %output.0, i64 %index
%2 = getelementptr inbounds i8, ptr %0, i64 16
%wide.load = load <4 x i32>, ptr %0, align 4
%wide.load6 = load <4 x i32>, ptr %2, align 4
%3 = icmp slt <4 x i32> %wide.load, zeroinitializer
%4 = icmp slt <4 x i32> %wide.load6, zeroinitializer
%5 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %wide.load, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
%6 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %wide.load6, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
%7 = trunc nuw <4 x i32> %5 to <4 x i8>
%8 = trunc nuw <4 x i32> %6 to <4 x i8>
%9 = select <4 x i1> %3, <4 x i8> zeroinitializer, <4 x i8> %7
%10 = select <4 x i1> %4, <4 x i8> zeroinitializer, <4 x i8> %8
%11 = getelementptr inbounds i8, ptr %1, i64 4
store <4 x i8> %9, ptr %1, align 1
store <4 x i8> %10, ptr %11, align 1
%index.next = add nuw i64 %index, 8
%12 = icmp eq i64 %index.next, %n.vec
br i1 %12, label %middle.block, label %vector.body, !llvm.loop !3
```
Manual clamp
```llvm
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %input.0, i64 %index
%1 = getelementptr inbounds i8, ptr %output.0, i64 %index
%2 = getelementptr inbounds i8, ptr %0, i64 16
%wide.load = load <4 x i32>, ptr %0, align 4
%wide.load7 = load <4 x i32>, ptr %2, align 4
%3 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load, <4 x i32> zeroinitializer)
%4 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load7, <4 x i32> zeroinitializer)
%5 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %3, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
%6 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %4, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
%7 = trunc nuw <4 x i32> %5 to <4 x i8>
%8 = trunc nuw <4 x i32> %6 to <4 x i8>
%9 = getelementptr inbounds i8, ptr %1, i64 4
store <4 x i8> %7, ptr %1, align 1
store <4 x i8> %8, ptr %9, align 1
%index.next = add nuw i64 %index, 8
%10 = icmp eq i64 %index.next, %n.vec
br i1 %10, label %middle.block, label %vector.body, !llvm.loop !8
```
---
The standard library clamp is implemented as follows:
```rust
fn clamp(n: i32, min: i32, max: i32) -> i32 {
assert!(min <= max);
if n < min {
min
} else if n > max {
max
} else {
n
}
}
```
It seems OK to transform the standard library clamp into the manual clamp; minimized IR follows.
alive2 proof - https://alive2.llvm.org/ce/z/pRRVhU
Rust source - https://rust.godbolt.org/z/3PxW1xWqo
```llvm
define i8 @src(i32 %input) {
%1 = icmp slt i32 %input, 0
%2 = tail call i32 @llvm.umin.i32(i32 %input, i32 255)
%3 = trunc nuw i32 %2 to i8
%4 = select i1 %1, i8 0, i8 %3
ret i8 %4
}
define i8 @tgt(i32 %input) {
%1 = tail call i32 @llvm.smax.i32(i32 %input, i32 0)
%2 = tail call i32 @llvm.umin.i32(i32 %1, i32 255)
%3 = trunc nuw i32 %2 to i8
ret i8 %3
}
```
---
Real-world examples from functions in the `image-webp` crate:
https://rust.godbolt.org/z/veGzv1dPx - [source](https://docs.rs/image-webp/0.1.3/src/image_webp/vp8.rs.html#2311-2321)
https://rust.godbolt.org/z/sf4v6ceGM - [source](https://docs.rs/image-webp/0.1.3/src/image_webp/vp8.rs.html#2400-2431)
_Originally reported at https://github.com/rust-lang/rust/issues/125738_
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWVtz4joS_jXiRYXLli-YBx4Skpya2p06p1J79jymhC1AM7bkSDKQ_Pqtlgz4wiWZYXfn4UxNALu7v76qW5ap1nwlGJuh-B7FDyNam7VUM_mdCpbJ0ULmb7MvAj_X2iAyx0a9cbHCRuKsoGWFqcixUbXIqGF4qWSJKdYFzxiWS4wSn4cEJT7wd-_XKdyutQVbM6wNFTlVOS74QlH1hshENyqWtcgMlwJXSuZ1xjQupWKYC21UbSkamzUVuKSipkXx5uQAecvNGrSVdAfqwFi44gIlvof8B-TfNZ-J7_4r8NPequoFXgqHhUjKRVUbFN5hRBIU34Nf8QNERNbmSClrg1F8X6eWOMVocu_Q8FIqjEiKSMKtFFC5wBbW44YpS5x67xy0OVB7_6WsjaN1AZt_iNxJjMIHzL29qT4oIHEM_FTjOkVhSwhNHhqf9z8aT134Xm7lMMb_C58hs9bjqQdpJelpv4dOHzLuLtfGVBqFd4g8IfIEVeCtZL6QhfGkWiHy9A5_y0n4Tb9-e-2UTjjPmaG80Ch8ROFc12UJBRw-3mnNykXx1ilVFM4ReToytaHmNvRd86gu3R3vn_f3_ksENnaCUcpN_lrjXVkmEORdWW6lynFlFCRG5Rwjco8jRO6UTdIpYZCKr0rbn0EyAKl2UsH3rizDBiTscWRltTJ5myNpOCqafdc63-5ZL4BYzu3iAmfLkqghRWct2XPEp9Tk20sgQ0vOcNr4Ussx6Tk-DN6ewz-LkTYcwd7fozd70qRvqsgb-H15pEMOgTvwpMfhLGxz9J2g2fe6G5C0p-0DLEODwh5HKxTJmewdw5lcDeekF85hjZxL2jGqcS_uLesnV8M5OV-CNlYtlkFeB-E8y3kwKDrNARGB77y79Hd2vbumcXJBXpFrOsdBurc0aG6FQQGZ414NZGXliNpODtUjfxPMfu974qmO7j6_2qHWTPEf66yuhj7dWk80Nv90RZ7I5RnOlkXB59v1CYuCMytgaFGf88NFc8KFTxRNcNuiuV4wbj63JnqL9lhyY1iOvzzjMe7uGGjBN4x4RbEpmw1Dxppdw3rxZ5pur815kHS3NiwzUnmw87abrPAeV4rl2u54EInbdDJv3anW-90OibnI2c5KVGuOeRJBmLHfF8BNpIG4l_IE25kuI-jCrQpHJPYt-IoZVrCSCQOZ5GIha5FrDJtFMnfZBVTY11nd1pBGTwssuAiWtrCaTeIlMPJhsANKkLTktzxnXiFpbnGaH_MI76xb4WMfgBZ8JXB0jP0BILmOQIYIcDt021uoaF2YrnRHBYi_MyW54IbTgr8z1YKJPgqTXMaJLQ4sCZzRougDRbZ6vbrkwttENvnpZZN71HDOQ-IeWeb48k-I3rRlWnJL05Lb2jZxtsHjMRb1dhj_GB6M93fTQ7cBUnpNNjkjC7SpFdasYNkx60EjGLadBMFB4vt0cKW9Wv1L8NGPwKdt-I93g2C_gA9rRxup2AB-2pdxKy64LBX4bbFTcp2Waa2GAQXp6vSm9qgCKHJcluy1w9ruvcLbsGwvtlCYB04YyAVdsAIuS57nBfMWhcy-dwiDKRHYhVBIWcFFeHIEXtgt3XI-_T2g_o8DqoMw-ckJda3v6pLufmgkDHrGdDDWbqR78knlt5iF4a83A6NfZPTdaPbdcnpMfmh6tDVNbzc7_J-ZHf5NZkd6cnaMx2P3418njvObo3yuMS8rlxeWY6rt2f9SFoXccrHyzh7Gtw7iBQydpn-XvHNFd4erKR5DHqA8O8fHVGumDCIBImnJBWQN4mmPkafH82K-xJaGLU___BnUts_TMSs028s8AtgJGbo7nkU7gQFPF_XygbX7_GKwZqzU-Pd_wJIwigq9lKo8-U6lSQLwrVlz3L8Pq_WUl_zdPt42iXAPtbhSUi4_88RbPT__e_2n
w3iutcFa1ipjA4gzx-zhH7u_gt1fr7LzXD7ciORsyQXDPIUWp1WGSGrz3Qz03quDwxQ_PA51mefYH8zoY0u1vO1G6tpoH-LQGafDUXloZ40QgUzwdDDXmn11s2Qtauo2RuApDI9GQjHT3Ir6tTKMj1mZD8bntNN2mp532u-6_LnwBT8ZumMgwo8smkOnema0wFupihyzHYXGpN07zP0rR425sIsFJT4v6YqNt2xRocTHmaKGOZQP1fSG_fa-CfI_dngMe1m3IOzeNu3K5zLTntKIPLUUkiffC7wQkSdb5o700pA2Veop7a1NWSASkjAIxiSEBjf9hH16GW2SjP329b9uX-T7YxKFR_vc58vviq-4sO9vFaukggHRVb3iZl0vvEyWjR_jgopV8xu0al0zsCwg8SRMX0b5LMyn4ZSO2CyYkCjw40kSjtazhERpTEjKYsqWeZRkOSERmyRBHi0jkoYjPiM-ifw0mBISJH7o-SxkGWPBgoUxY3mAIp-VlBeH9jeyymeBH6WTeGQnqbav1AkRbIstFRGC4oeRmoHQeFGvNCwJro0-whhuCjb7yrVmOZaVgZZM7dtv-y6b1ka66Ww7taamVtTYl-juJTxwIpI2XX06qlUxuxBF20zd17hS8hvLuoF07mxm5D8BAAD__wYF2bQ">