<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/155402>155402</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            Why was the cold block placed inside the hot loop? It appears that the MachineBlockPlacement pass did not perform effective optimization.
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          Ganwenzhao
      </td>
    </tr>
</table>

<pre>
    ```cpp
// Declared only; no definitions appear in this snippet, so the calls stay opaque here.
extern int verify();
extern int slowpath();
extern int fastpath();
extern void other_work();

// Reproducer for the block-placement question.
// For each of the k outer iterations: poll verify() in an inner loop, add
// either slowpath() or fastpath() to the running total, then call other_work().
// Returns the accumulated total.
// Note: t is never reset, so it keeps counting down across outer iterations.
int test(int k, int t) {
    int res = 0;
    int ret_no = 0;
    // outer loop: runs k times
    for (int i = 0; i < k; i++) {
        // inner loop: keep calling verify() until t (post-decremented on every
        // check) has dropped below zero
        while (t-- >= 0) {
            ret_no = verify();
            // unlikely branch: verify() returned a negative value, so take the
            // slow path and jump straight to do_work, skipping fastpath()
            if (ret_no < 0) [[unlikely]] {
                res += slowpath();
                goto do_work;
            }
        }
        // inner loop finished without a negative verify() result: take the fast path
 res += fastpath();

    // both paths join here; other_work() runs once per outer iteration
    do_work:
        other_work();
    }

 return res;
}

```

Compiling the example above with the following commands (llc version 19.1.7) shows that
the cold block %bb.5 (the slowpath call) remains inside the loop.

clang++ -c -O2 -target aarch64-linux-gnu -emit-llvm -S test.cc -o test.ll
llc -O3  -mtriple=aarch64-none-linux-gnu -filetype=asm test.ll -o test.s

```asm
// %bb.1:                               // %.preheader
        mov     w19, w0
        mov     w21, w1
        mov     w22, wzr
        mov     w20, wzr
.LBB0_2:                                // =>This Loop Header: Depth=1
 //     Child Loop BB0_3 Depth 2
        cmn     w21, #1
        csinv   w8, w21, wzr, lt
        sub     w23, w8, #1
.LBB0_3: //   Parent Loop BB0_2 Depth=1
 // =>  This Inner Loop Header: Depth=2
        tbnz    w21, #31, .LBB0_6
// %bb.4:                               //   in Loop: Header=BB0_3 Depth=2
        sub     w21, w21, #1
        bl      _Z6verifyv
        tbz     w0, #31, .LBB0_3
// %bb.5:                               //   in Loop: Header=BB0_2 Depth=1
        bl      _Z8slowpathv
        add     w20, w0, w20
        bl      _Z10other_workv
        add     w22, w22, #1
        cmp     w22, w19
        b.ne    .LBB0_2
        b       .LBB0_7
.LBB0_6: //   in Loop: Header=BB0_2 Depth=1
        bl      _Z8fastpathv
        mov     w21, w23
        add     w20, w0, w20
        bl      _Z10other_workv
        add     w22, w22, #1
        cmp     w22, w19
        b.ne    .LBB0_2
```

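The following layout appears to be more efficient: the cold slowpath block (.LBB0_9) is moved out of the loop body and placed after the return blocks.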

```asm
// %bb.1:                               // %.preheader
        mov     w19, w0
        mov     w21, w1
        mov     w22, wzr
        mov     w20, wzr
.LBB0_2:                                // =>This Loop Header: Depth=1
 //     Child Loop BB0_3 Depth 2
        cmn     w21, #1
        csinv   w8, w21, wzr, lt
        sub     w23, w8, #1
.LBB0_3: //   Parent Loop BB0_2 Depth=1
 // =>  This Inner Loop Header: Depth=2
        tbnz    w21, #31, .LBB0_5
// %bb.4:                               //   in Loop: Header=BB0_3 Depth=2
        sub     w21, w21, #1
        bl      _Z6verifyv
        tbz     w0, #31, .LBB0_3
        b       .LBB0_9
.LBB0_5:                                //   in Loop: Header=BB0_2 Depth=1
        bl      _Z8fastpathv
        mov     w21, w23
.LBB0_6: //   in Loop: Header=BB0_2 Depth=1
        add     w20, w0, w20
        bl      _Z10other_workv
        add     w22, w22, #1
        cmp     w22, w19
        b.ne    .LBB0_2
// %bb.7:
        .......                             // some instructions omitted
        ret
.LBB0_8:
        ....... //   some instructions omitted
        ret
.LBB0_9: //   in Loop: Header=BB0_2 Depth=1
        .cfi_restore_state
        bl      _Z8slowpathv
        b       .LBB0_6
```
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzsV92Oo7gSfhrnpgQCE5JwkYt0Z_qckeZoRjojrbQ3LQOV4G1jI9sk3f30K9skkHRmt_dPo5UGRQFcVV999UNhmDF8LxHXJL8j-XbGetsovf4Pk0eUrw1Ts1LVL2uySMKv6jqSbPDZopbApYUDar57IXRFaEGyu0uhEerYMdt8Q7xjxn5DfFC8BmUb1I9HpZ-mGiTZOGOLxhK6cpdPhN57QEtoAWTptADAL2k0QLItJMF4XLaPUl1LCH0g9AFUb1GDUKob1ndKw-CLn2385T08-UtC7_xv6n6CyKUcED1WSBoY1aJtuNxPDI4NF-ic2SgCkn0I3t7gumMSw9sqnDz3UvAnfIFSM1k1LlH4jFVvcSzOJSrfOe9n7PvBu2uPuwAmXki-Jfn2BqVAy4DLRba9Wf9r9b2yCmoVyvxGgyy302xe3oUIh2Sqp2lw59ZKNlNCtzpuQDxT2Exc3O7AkUqAt72WzsuAd5KcH5pwuzFgGnV0rQ-2QcBn1nYCgZXqgI677qXkcu-FOyWEOrq7SrUtk7UrihCVi9ZwJSEt4jReutJoPCATBmzDLJBk48wrJWoohaqegNC8LOMcNLaMSwNcGl6jd-L6MQ7kKsHkPjQxRBVEnylEluk9WmBMV81iHgku--doL3uIsOU2EuLQQvR__yDGVQWRCpdCkGTjqEafM4CotZp3Akm2PQFJJXGKtuMC7UvnVUx7AjnjmatsMtO621D8EF1Ksjd9dXWM-nGnsUFWo_bARasOJCmOaeFqcEwuFmnqF9PLReoXXy_taTKuxp_u7pJH-vuszrSyLck-fG24gU9uSPw38Ms2sMXONiTbppNn2h33DRd1UHa-sqAINHCqWjnSJzQbAqgMl57synMdonvV7iRs0DF96U0zL1tNAEJUmWN1JvKFaZR25EFvEg7hAfgAP_pZ-I0wB_62lK8XAWT-IjBYXJd__u7yu-HvXTuLk_ftJIEjhVMe0kmqxkyWgiTF48-LMHwOJ9aedHKDc3bNOf8bOF8me2S1Os3dgRer60mHhn-aXJikyTjqrqxCt4fTpJXabiJNiwEtlkiS4tT_Ye28sDw30eKiif5wfKcxfrj1sNLs-0d9Ofm_Xgx0wV5Ub4F1HTJtwCooEVqlEXC34xVHaeMfI-_HyJuOj_zfOvKmj39xTug7ht8_Oh3-0hD6nnNl2gLLsFV1CuF4Tzr5XkmN_svD7QSt7ivLlQx7rEKjPWdn9QZ-TNW7QYo_l-K42vFHjcYqjY_GMou_-X4bW2wxnZqzep3VRVawGa7TZb6gaTpfLGbNelWzZbpL6uVqReuyLFZY5lmdZXTBqizFfMbXNKF5sqKLlGZpsogr3M0ztivLcplW-XxB5onbTovYbYJjpfczbkyP6zTP5wmdCVaiMP6bmlKJR_BSQqn7xNZrZxSV_d6QeSK4sWaEsdwKXP_UvMCRGbjayneCVVhPd_CNsuE7NXuAj5N3ivsUcPL_sarhEu-c-Rdn3brR1TFjoOY1SGWhQ71TunUvH6wsPyCozvKWvzJX03jWa7FurO2M6wdfyT23TV_GlWoJfXDMh1PUafULVpbQBx-vIfRhSMhhTX8NAAD__2bdcuE">