<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/66652">66652</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Bizarre optimization of simple array-filling loop in -Os
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
dzaima
</td>
</tr>
</table>
<pre>
This code:
```c
#include&lt;stdint.h&gt;
#include&lt;stddef.h&gt;
void fill_i16(int16_t* a, int16_t v, size_t l) {
for (size_t i = 0; i < l; i++) a[i] = v;
}
```
when compiled with `-Os` for either x86-64 or AArch64, produces bizarre code, e.g. on x86-64:
<details>
<summary> bizarre x86-64</summary>
```asm
.LCPI0_0:
.quad 6 # 0x6
.quad 7 # 0x7
.LCPI0_1:
.quad 4 # 0x4
.quad 5 # 0x5
.LCPI0_2:
.quad 2 # 0x2
.quad 3 # 0x3
.LCPI0_3:
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.LCPI0_4:
.quad -9223372034707292160 # 0x8000000080000000
.quad -9223372034707292160 # 0x8000000080000000
.LCPI0_5:
.quad 8 # 0x8
.quad 8 # 0x8
fill_i16: # @fill_i16
test rdx, rdx
je .LBB0_19
lea rax, [rdx + 7]
and rax, -8
dec rdx
movq xmm0, rdx
pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [6,7]
movdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [4,5]
movdqa xmm3, xmmword ptr [rip + .LCPI0_2] # xmm3 = [2,3]
movdqa xmm4, xmmword ptr [rip + .LCPI0_3] # xmm4 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
xor ecx, ecx
movdqa xmm5, xmmword ptr [rip + .LCPI0_4] # xmm5 = [9223372039002259456,9223372039002259456]
pxor xmm0, xmm5
pcmpeqd xmm6, xmm6
movdqa xmm7, xmmword ptr [rip + .LCPI0_5] # xmm7 = [8,8]
.LBB0_2: # =>This Inner Loop Header: Depth=1
movdqa xmm8, xmm4
pxor xmm8, xmm5
movdqa xmm10, xmm8
pcmpgtd xmm10, xmm0
pshufd xmm9, xmm10, 160 # xmm9 = xmm10[0,0,2,2]
pshuflw xmm11, xmm9, 232 # xmm11 = xmm9[0,2,2,3,4,5,6,7]
pcmpeqd xmm8, xmm0
pshufd xmm8, xmm8, 245 # xmm8 = xmm8[1,1,3,3]
pshuflw xmm12, xmm8, 232 # xmm12 = xmm8[0,2,2,3,4,5,6,7]
pand xmm12, xmm11
pshufd xmm10, xmm10, 245 # xmm10 = xmm10[1,1,3,3]
pshuflw xmm11, xmm10, 232 # xmm11 = xmm10[0,2,2,3,4,5,6,7]
por xmm11, xmm12
pxor xmm11, xmm6
packssdw xmm11, xmm11
movd edx, xmm11
test dl, 1
je .LBB0_4
mov word ptr [rdi + 2*rcx], si
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
pand xmm8, xmm9
por xmm8, xmm10
packssdw xmm8, xmm8
pxor xmm8, xmm6
packssdw xmm8, xmm8
movd edx, xmm8
shr edx, 16
test dl, 1
je .LBB0_6
mov word ptr [rdi + 2*rcx + 2], si
.LBB0_6: # in Loop: Header=BB0_2 Depth=1
movdqa xmm9, xmm3
pxor xmm9, xmm5
movdqa xmm10, xmm9
pcmpgtd xmm10, xmm0
pshufd xmm8, xmm10, 160 # xmm8 = xmm10[0,0,2,2]
pcmpeqd xmm9, xmm0
pshufd xmm9, xmm9, 245 # xmm9 = xmm9[1,1,3,3]
movdqa xmm11, xmm9
pand xmm11, xmm8
pshufd xmm10, xmm10, 245 # xmm10 = xmm10[1,1,3,3]
por xmm11, xmm10
packssdw xmm11, xmm11
pxor xmm11, xmm6
packssdw xmm11, xmm11
pextrw edx, xmm11, 2
test dl, 1
je .LBB0_8
mov word ptr [rdi + 2*rcx + 4], si
.LBB0_8: # in Loop: Header=BB0_2 Depth=1
pshufhw xmm8, xmm8, 132 # xmm8 = xmm8[0,1,2,3,4,5,4,6]
pshufhw xmm9, xmm9, 132 # xmm9 = xmm9[0,1,2,3,4,5,4,6]
pand xmm9, xmm8
pshufhw xmm8, xmm10, 132 # xmm8 = xmm10[0,1,2,3,4,5,4,6]
por xmm8, xmm9
pxor xmm8, xmm6
packssdw xmm8, xmm8
pextrw edx, xmm8, 3
test dl, 1
je .LBB0_10
mov word ptr [rdi + 2*rcx + 6], si
.LBB0_10: # in Loop: Header=BB0_2 Depth=1
movdqa xmm8, xmm2
pxor xmm8, xmm5
movdqa xmm10, xmm8
pcmpgtd xmm10, xmm0
pshufd xmm9, xmm10, 160 # xmm9 = xmm10[0,0,2,2]
pshuflw xmm11, xmm9, 232 # xmm11 = xmm9[0,2,2,3,4,5,6,7]
pcmpeqd xmm8, xmm0
pshufd xmm8, xmm8, 245 # xmm8 = xmm8[1,1,3,3]
pshuflw xmm12, xmm8, 232 # xmm12 = xmm8[0,2,2,3,4,5,6,7]
pand xmm12, xmm11
pshufd xmm10, xmm10, 245 # xmm10 = xmm10[1,1,3,3]
pshuflw xmm11, xmm10, 232 # xmm11 = xmm10[0,2,2,3,4,5,6,7]
por xmm11, xmm12
pxor xmm11, xmm6
packssdw xmm11, xmm11
pextrw edx, xmm11, 4
test dl, 1
je .LBB0_12
mov word ptr [rdi + 2*rcx + 8], si
.LBB0_12: # in Loop: Header=BB0_2 Depth=1
pand xmm8, xmm9
por xmm8, xmm10
packssdw xmm8, xmm8
pxor xmm8, xmm6
packssdw xmm8, xmm8
pextrw edx, xmm8, 5
test dl, 1
je .LBB0_14
mov word ptr [rdi + 2*rcx + 10], si
.LBB0_14: # in Loop: Header=BB0_2 Depth=1
movdqa xmm9, xmm1
pxor xmm9, xmm5
movdqa xmm10, xmm9
pcmpgtd xmm10, xmm0
pshufd xmm8, xmm10, 160 # xmm8 = xmm10[0,0,2,2]
pcmpeqd xmm9, xmm0
pshufd xmm9, xmm9, 245 # xmm9 = xmm9[1,1,3,3]
movdqa xmm11, xmm9
pand xmm11, xmm8
pshufd xmm10, xmm10, 245 # xmm10 = xmm10[1,1,3,3]
por xmm11, xmm10
packssdw xmm11, xmm11
pxor xmm11, xmm6
packssdw xmm11, xmm11
pextrw edx, xmm11, 6
test dl, 1
je .LBB0_16
mov word ptr [rdi + 2*rcx + 12], si
.LBB0_16: # in Loop: Header=BB0_2 Depth=1
pshufhw xmm8, xmm8, 132 # xmm8 = xmm8[0,1,2,3,4,5,4,6]
pshufhw xmm9, xmm9, 132 # xmm9 = xmm9[0,1,2,3,4,5,4,6]
pand xmm9, xmm8
pshufhw xmm8, xmm10, 132 # xmm8 = xmm10[0,1,2,3,4,5,4,6]
por xmm8, xmm9
pxor xmm8, xmm6
packssdw xmm8, xmm8
pextrw edx, xmm8, 7
test dl, 1
je .LBB0_18
mov word ptr [rdi + 2*rcx + 14], si
.LBB0_18: # in Loop: Header=BB0_2 Depth=1
add rcx, 8
paddq xmm4, xmm7
paddq xmm3, xmm7
paddq xmm2, xmm7
paddq xmm1, xmm7
cmp rax, rcx
jne .LBB0_2
.LBB0_19:
ret
```
</details>
Compiler Explorer: https://godbolt.org/z/9jfYh8Wz8 (the first version exhibiting this behavior appears to be clang 12).
Use of SIMD here (the loop is unrolled, _not_ vectorized) is completely unnecessary; the output of `-Oz`, which is a simple scalar non-unrolled loop, is ~5x faster in a simple test.
The "optimized" IR in question:
<details>
<summary>IR</summary>
```llvm
define dso_local void @fill_i16(ptr nocapture noundef writeonly %a, i16 noundef signext %v, i64 noundef %l) local_unnamed_addr {
entry:
%cmp3.not = icmp eq i64 %l, 0
br i1 %cmp3.not, label %for.cond.cleanup, label %vector.ph
vector.ph:
%n.rnd.up = add i64 %l, 7
%n.vec = and i64 %n.rnd.up, -8
%trip.count.minus.1 = add i64 %l, -1
%broadcast.splatinsert = insertelement <8 x i64> poison, i64 %trip.count.minus.1, i64 0
%broadcast.splat = shufflevector <8 x i64> %broadcast.splatinsert, <8 x i64> poison, <8 x i32> zeroinitializer
br label %vector.body
vector.body:
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %pred.store.continue18 ]
%vec.ind = phi &lt;8 x i64&gt; [ &lt;i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7&gt;, %vector.ph ], [ %vec.ind.next, %pred.store.continue18 ]
%0 = icmp ule <8 x i64> %vec.ind, %broadcast.splat
%1 = extractelement <8 x i1> %0, i64 0
br i1 %1, label %pred.store.if, label %pred.store.continue
pred.store.if:
%2 = getelementptr inbounds i16, ptr %a, i64 %index
store i16 %v, ptr %2, align 2
br label %pred.store.continue
pred.store.continue:
%3 = extractelement <8 x i1> %0, i64 1
br i1 %3, label %pred.store.if5, label %pred.store.continue6
pred.store.if5:
%4 = or i64 %index, 1
%5 = getelementptr inbounds i16, ptr %a, i64 %4
store i16 %v, ptr %5, align 2
br label %pred.store.continue6
pred.store.continue6:
%6 = extractelement <8 x i1> %0, i64 2
br i1 %6, label %pred.store.if7, label %pred.store.continue8
pred.store.if7:
%7 = or i64 %index, 2
%8 = getelementptr inbounds i16, ptr %a, i64 %7
store i16 %v, ptr %8, align 2
br label %pred.store.continue8
pred.store.continue8:
%9 = extractelement <8 x i1> %0, i64 3
br i1 %9, label %pred.store.if9, label %pred.store.continue10
pred.store.if9:
%10 = or i64 %index, 3
%11 = getelementptr inbounds i16, ptr %a, i64 %10
store i16 %v, ptr %11, align 2
br label %pred.store.continue10
pred.store.continue10:
%12 = extractelement <8 x i1> %0, i64 4
br i1 %12, label %pred.store.if11, label %pred.store.continue12
pred.store.if11:
%13 = or i64 %index, 4
%14 = getelementptr inbounds i16, ptr %a, i64 %13
store i16 %v, ptr %14, align 2
br label %pred.store.continue12
pred.store.continue12:
%15 = extractelement <8 x i1> %0, i64 5
br i1 %15, label %pred.store.if13, label %pred.store.continue14
pred.store.if13:
%16 = or i64 %index, 5
%17 = getelementptr inbounds i16, ptr %a, i64 %16
store i16 %v, ptr %17, align 2
br label %pred.store.continue14
pred.store.continue14:
%18 = extractelement <8 x i1> %0, i64 6
br i1 %18, label %pred.store.if15, label %pred.store.continue16
pred.store.if15:
%19 = or i64 %index, 6
%20 = getelementptr inbounds i16, ptr %a, i64 %19
store i16 %v, ptr %20, align 2
br label %pred.store.continue16
pred.store.continue16:
%21 = extractelement <8 x i1> %0, i64 7
br i1 %21, label %pred.store.if17, label %pred.store.continue18
pred.store.if17:
%22 = or i64 %index, 7
%23 = getelementptr inbounds i16, ptr %a, i64 %22
store i16 %v, ptr %23, align 2
br label %pred.store.continue18
pred.store.continue18:
%index.next = add i64 %index, 8
%vec.ind.next = add &lt;8 x i64&gt; %vec.ind, &lt;i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8&gt;
%24 = icmp eq i64 %index.next, %n.vec
br i1 %24, label %for.cond.cleanup, label %vector.body
for.cond.cleanup:
ret void
}
```
</details>
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsW1tv47gV_jXMCxFDJHV9yMPYmaADbLHFdouiT4Es0TG3sqih6MTJQ397QYqUqQt9m9lFu7tGEskieXjO-c6Fh2LytmUvNaUPIFqC6PEu38stFw_lR852-d2al-8PP29ZCwteUkA-geARBPZvHHQ_hfmOCauLaq86rlpZsloutoB8nm8t6cZpfeWshBtWVc8MxQCnrJYofpYAf4I5wCtovsNX9aVlH_RZwgrgDIJk2VGAcMMFBDg1rQwC8ggDQJb6dgUrfQvwUv9kMAfRkoHoUfd7BcTQAcnjSLzua8F3DatoCd-Y3EIQB_c_tiAOIK8hZXJLBTyk8X0cQi5gnotiG4eK14rmZQslh2v2kQtBO0XiFaSLl4UZMtYrWZVU5qxqj8ojq3a_2-XiHZDPPSk7egXw07F5FqK83XVPFj-s_vYleA76OaH5LL7u8xJCGEOACQwO8XxzYpqTATnkJRfC05-OWjg_OLpocDRgBR9ZsWTwRWTwPA_ENJPBLGQq8PpdUghhYPoHvuZLePn-g79x3t-fTMg0o9-RTL8HnIyDhZ6Icp9hTEiCAxImQYIzjOJgSisNuo-9fm9KhsloGmvSi2RN52NNOmzu8yH5dAFVEAb9gAF1SVuprqI8qMSjLoP2X6jh4oflMnhGmWmtaK4fi1wPA9FSlAcI8BImIHocksjr0u17P5KvpAW0HHQNO_76VT057HbBLFNNu91vymMHe41nFKykV-06k-uO0VJ1RgCvuuuY3x1_Lb_mmjoy1N-4KGEjhRaUNVrQPlmqVUI3CdKTgGgZA7yaKsIhjM8TRg5hbAmrZUN0ijA5Txg7hIklrBgipwiH5wkTh3BoCQdGz75fdLJ9zNCBC32lhbYldeltxnIanec0dDiNLKe9y2dBgHGUhZGCce7pmKvGsOXYo1l3wKbYNfRrqR7Fpin26jg5z3nkcJ5YzlOAV2nPVeeq-Hxg6CIDeQTks17Gf6lrKuAPnDfwLzQvqVAkHmkjt4A8Ii_XqeE69ColHSllxtGs3kbBQanvRZaDLjZgO1EgM01dn1GsdqJAZqMAChzTxPq3B1XTrd50NxsA9ASYTBaL1vGRJZwZutj8Kn_snBavRlHBMY10JJqV_ihh2utHMRJOF7-Gk9QykoJoiYx_kVnnduXEA_J-ObFL_kI57XQmD7jTITTFsse5u5nKankJBmB6ZJ0D0xCeSDkBs7eSS9A0kWkwEfa6RN8nHmup-HfbliOGZ5xPXWmXtGd62LReVtohTuX0cEJbXwcxqGQ6BmGAP4nioITWlbYbb8IL4w2ErNYxRvW3YeZRRyxfrHEsJ-39cdjjqPz0CLIBxmrUfAbOdC5ixVcTmWIz6tBuhW30LcYuQG2aRE4hZu7ncLtgAalxuxo1J7rb8Exm9O00X5AexrifSA-2yzSGnssS6SVZwuHABPHsLAPZIJv4g3jmZpPTQdxVEhopyY24yGPyJ-PuVZHWUpyLg2OVXBLkvlewbOhBirdRsFQCXuJ7Q6eburq-nnO8cNbx0l8xYCpUt2-TdQOaJvbZdYNdl49TXqjT3uw6opttYOAnZsvGy6UrZjsadXbCpofSG4_3rmumHn8BQzNJZxye_AllxoJ9WWliv7ojud58x254qf3Gs_aLgsu2Hm6w35niwr-SGhcX31hVuEb0KxQXLvnfoMZwBP6z1JgT-LsXHCcA_r-vO3ypNLwhFuHbYlE6H4su2O34IxQfnlwRzeBzQY2Bri8N9T0K5kG6oET89oTRR2uvesflxp91xjfWGZaBby03Znn83y8-zOeWwHll_W_8aKb2h5e45fwGQPcK6VrH-41KjD9qbeHY83cpL442el3mSG4y0BvrZDRfKKP0FgM1n7wsoeheFo0lzsvyKxy820q8Pcioh9OEzw5G8z2KXaOv5r2oKMavXusu
G9v3Oa5KsuPLZUFldzc6k9Qd_ZkcFtJ_V915JQE_H5qKi-5Nz1bKplV08RPATy-8XPNKLrh4AfjpA-Cn7JfNv7bpPz9SfVZKwrxpaC5aKLcUbphoJXylomW87k5ByS1r4Zpu81fGBXzLW1hUef0CEV64rPyjpZBv4N-__PURbqmgEOBUUaw4byBr4b4WvKpoqVT0XHP5DF9pIblgH-pZBvXxs11TUUmrd7iva1rQttWnnZaaNb6XzV6qOfS5rA-lILyCb1tWbNXoHLZMjYdtkVe5gDWv7-2kmgvVm7XwP9EBbvJWUqGssB-lXGIg0M9bJQPmjWS7jkkMv_ykxnzd01YyXl95pOvLTxed4qqqV3OMq6QbVlNYtvy54kVeQX2Azj0CgFPlhjUv8kbuBYU139cl3cA3wSTldfUOAY66s3Uo7lv1YcCDVG36qB2Lw74N4EgfudMTPu_rOt_R8jkvS3E8hUdrqdjvj24AHBW7hixqLnWMZMoj6FdNt6O3gn3yXgvIkDtEtVb5mlbq4YaLRcHrclFUNK_3zaCxM5hFs3X1dnw4YKheiLpc7BvNkAoeLjPJoOMrLbpedd_LjgaDcw4AR1KwZlHwfS0XO1bv2wWaneDeRi-Ao7XgeVnkrVy0TZVLVrdUGD3pW1rRHa3Vk1UKD4oMIJ9hw1nLa4vO7My2MXD4G02mp1GpblPRTlGjaXz8Kdo-huxzgtXzDyo4q5lkecU-qHBgHuO25uX7DHL68QA7Vpf0oFlvtqyTP1pC7e2uFUCTZVSjHbVQhm06NoKWi1ZyQZVJSVbvKUqhk5s7YgtWl_1kI-0oymTVKdlou1c7tjfE3oT2JrI3sb1JlLef5t-wcq0EwdHl9hWdwmvIGoojsB06nSGr1UNeTGwSGWLB2OZ6b0YDT3UYZxtfixXJtYnhwIFVdPtIL9QypyIfq9cqbrVQB8NVtyixEa_zG20WloymrGOhjX1mhAYzr9hLfXyf4Jrw5Wz3zQPmyVXaRZNgSfzqjc7pN_YqOBoy2Z314WKoOmeBCHAU3YZCeBaB6HoEfIId23vxAI7iqyDAEwhiPwTJOQhSLwTJEILEBwF2OqW3YZCcxSC9HgOfZMf2gXzZVSCQCQiZHwRvUx86Ay8K2ZBLs1cxAwM52pPZ5r0ahuM-hheHblPhOiC80jkdhkLiq7AIJ1h0W_LzYCBvPujZwV40EBpxSnxwhG6v8EY8yHk8whvw8MnndBhKGV2FRzTFw5sJ2AZ5E0jPTujHg4w4jX14RG6v5EY84vN4JDfg4ZPP6TCUMr0Kj3iKR3oCj7NZG_nTNhrlbZT58Ijd9VNwIx7Z-dVTcAMe57I3ikfrv-tWqMkED3xijYrOpnDkz-FolMQx9uHhFp7dmd3r8cD4PB7kBjzOZXKUzlRpi24jYVgC9-Km00prOOBktdIVXqkV_dYbu83SKT2c26GYlI56U2BqQOHVexXjmncy5qhSQaXe4DHdPf-hON4NvCsfSJmRLL-jDyjOoiBM4iS52z4kaZwnxTrChIRhiTZRRNI8jcoABTSLCnzHHnCASZChFIUkJOEiiVBAQ7Iuy3VSUroBYUB3OasWVfW6W3Dxcsfadk8f4jiO8J2WtNX_UYpxTd-gbgQYg-jxTjyoMffr_UsLwqBirWyPVCSTFX1Ymv9vNPtruWS8hnxjt-NyIfL3-w2rKla_mE3EGt7_2N7tRfUw2uJkcrtfLwq-A_hJb591l_tG8F9oIQF-0sy1AD9p5v8bAAD__9BoU2o">