<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href="https://github.com/llvm/llvm-project/issues/56330">56330</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            Poor code generation on two simple nested loops (arm64)
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          uncleasm
      </td>
    </tr>
</table>

<pre>
    The fragment at https://godbolt.org/z/Tdjv4xq45 shows an image processing algorithm that handles arbitrary image widths (including widths that are not multiples of the given block size).

```
do {
     do {
            auto d = vld1_u8(src);  vst1_u8(dst, vqadd_u8(d,d));  // KERNEL
            src += 8; dst += 8;
            asm("":"+r"(width));  // Artificial 'barrier' to disable a disaster
            width += block_size;
      } while (width >= 0);
      src += width;
      dst += width;
} while (width > block_size);
```


As far as I can tell, GCC handles the inner loop optimally, producing this code:
```
.L2:
        ldr     d0, [x2], 8
        adds    x0, x0, x6
        uqadd   v0.8b, v0.8b, v0.8b
        str     d0, [x5], 8
        bpl     .L2
```

Clang instead rewrites the outer loop's exit test as a comparison against the constant 0, which adds one `mov` to the inner loop to keep a copy of the loop counter from before the increment; this may or may not slow down the loop.
```
.LBB0_2:
        ldr     d0, [x2], #8
        mov     x10, x9
        adds    x9, x10, x6
        uqadd   v0.8b, v0.8b, v0.8b
        str     d0, [x5], #8
        b.pl    .LBB0_2
        add     x2, x2, x9
        add     x5, x5, x9
        cmp     x10, #0
        b.gt    .LBB0_2
```

Things become absolutely crazy when the `asm("" : "+r"(width));` statement is omitted:
the generated code contains forward and backward jumps, precomputed values, and rematerializations:

```
outer_loop_two_loops(long, int, unsigned char*, long, long, unsigned char*, long):      // @outer_loop_two_loops(long, int, unsigned char*, long, long, unsigned char*, long)
        add     x9, x3, x6, lsl #1
        add     x10, x4, x6, lsl #1
        lsl     x8, x6, #1
        add     x9, x9, #8
        add     x10, x10, #8
.LBB0_1:                                // =>This Loop Header: Depth=1
        mov     x11, x10
        mov     x12, x9
        mov     x13, x8
        mov     x14, x6
.LBB0_2:                                //   Parent Loop BB0_1 Depth=1
        mov     x15, x2
        mov     x16, x5
        cmn     x0, x13
        ldr     d0, [x15], #8
        uqadd   v0.8b, v0.8b, v0.8b
        str     d0, [x16], #8
        b.mi    .LBB0_4
        mov     x5, x16
        mov     x2, x15
        b       .LBB0_6
.LBB0_4:                                //   in Loop: Header=BB0_2 Depth=2
        add     x16, x2, x0
        add     x15, x5, x0
        add     x17, x0, x14
        cmp     x17, #0
        b.le    .LBB0_7
        add     x16, x16, x13
        add     x15, x15, x13
        add     x2, x16, #8
        add     x5, x15, #8
.LBB0_6:                                //   in Loop: Header=BB0_2 Depth=2
        add     x14, x14, x6
        add     x13, x13, x6
        add     x12, x12, x6
        add     x11, x11, x6
        b       .LBB0_2
.LBB0_7:                                //   in Loop: Header=BB0_1 Depth=1
        add     x2, x16, x12
        add     x5, x15, x11
        subs    w1, w1, #1
        b.ne    .LBB0_1
        ret
```



</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy9WN1zozYQ_2vwi6YePoyxH_wQx0mb6U3npnMzfcwISQYlAnGSsJP89V0JSACD05u2xzgIpNV-_rS7JJX0dfdQIpMzdFQ4K1hpUG5Mpb3oxgvv4ZdJmkphllJl8PYGf9_o02n18n0Ve_7B82-au-VAJGVI5_KsES4RL3DGUKUkYVrzMkNYZFJxkxcgDht05kKgHJdUMIRVyo3C6rXddebU5Bp54caRYsVQKQ0qamF4JZhG8ogyfmIlSoUkz0jzN-aF22VfI2_ttz_3SiXykn3zjOx1MdFeuDYSUeRFB3QSNHisN6CGVgT4e9EeoZM27STVxgtv0ek7prSdgXdqCVvaxoXo97s__7j7cikKuALJ3oraWHpg2H-f0E0XIMULQ_uzEYKHvXLDxvnsUvaNMvzICccCZpIUK8UZ7EiQtZJrnFr3uydtYOFCpGPbaeW8_ei8PVTPSw7onHPg1WkCDryzW_xGnz5xz-xG6eFyzwvD5WkhfaV6okbh79__sph6QARAapgQNoa_3t6iApeAPe0OAy9LppCQskKyMgBKIV4tlmlNLJQt0ifFLL-4uAy8KKhq7PKtJC_ev4RefLDPmyEhwEjb8cURtvf1kKa2YIPx5C83qUPf8GFArM1YcjwtOa2EG6368957KAEimCIiMPjAAJC0RVGBn5lzmqxN5zQigRZDMnGSQWXrNVlC5NZ-IU9wtztHnoYZgg3J3XxdFpICchntONalBSiwAwwAUYFfkVRusLlBC3mGM31ukpndspyJ0H7vP_5IlLwwGrkLLHDjS9DEaDsTx61bDf7PQF4qly6bYHaGjlVrNA-dTuGM-g1N7FbjKRpSVH0PgBb-WIvMXGgxiapvOWBDo5QRWUAqSrUUgCM4bkTht1cINmtCCtv66Q-O_g26kgAtxACChrmaxqFkFNwYRt8DDzwVc6XlKNUZK4qe6qKypYuiFJPnjylrYaWsflVtXCUDEsUKYK4gr_I3bDjgHThfK0DudDxaYD6as3QPwHkjZJlZAbx0xaQuNc9KAD3JMdh1Y-c6km6cp9lap7irzf7eyv85cqcR1ByAqMW_Jde2DEXBDH17VFafbbCzbsPmg_IK220L4cnzMhb-juhNP2EE756dvzqfRwcoSwBrjb7YzPUb5ExIXLD_wCpbzg7BXDoJOg1m1ifP68dy4-jZZNW5dZQH_6FZCH2FwwJnyRnlfPK5QXGbZmaW122GGeWWsllu4xF9lqiD-WT4L7MsKDifZgtux8aPqxkLG_uD9cxyE89g5IC0HRvWg3itfiRevHSxsls6DB5c0N_jNlcd2sCEbR8yQ9SvD7NESa-XCVbjSFd9qqkqAs3euyeS69p2Q3Rd326YIwt7zK5kiz6zcbJY_6QorTqvTvQXH1RRZ-9Vqtbs8DpVm6CCKaohbMO-Q5L_wiGz6WYydNaUzyNnTRnmgDp1vdvZmdjcLytLuix7uBwtKmY-_QRZ0F1Et9EWLww3gu2-Smhl3fdzxqAfdg0F9MsIKjZ83BbwzYtKBt130wy7L2OsijVEfruoldiNvtnhK7tOl9CwwIsQp274Bb5gnhiBgn_Pta4Z9AH38TqK_EW-I2RDWQg9FtumlESrOIpWa5yEwTHGxF-tFwKnTOgdpEXIiQu-C_0w9BM_gL9NnCwJPiZR6EcxObJjEjDoPKBD4mJpBdv_HizUzumQ1pmGRcG10R-LWLvegnX84Ss8l2pXl0QwaPsWTt-dU_ZvwXFNTA">