<!-- Issue metadata: one row per field, with the field name as the row header. -->
<table border="1" cellspacing="0" cellpadding="8">
  <tr>
    <th scope="row">Issue</th>
    <td>
      <a href="https://github.com/llvm/llvm-project/issues/95761">95761</a>
    </td>
  </tr>

  <tr>
    <th scope="row">Summary</th>
    <td>
      [AArch64] clang has much more load instruction than gcc
    </td>
  </tr>

  <tr>
    <th scope="row">Labels</th>
    <td>
    </td>
  </tr>

  <tr>
    <th scope="row">Assignees</th>
    <td>
    </td>
  </tr>

  <tr>
    <th scope="row">Reporter</th>
    <td>
      vfdff
    </td>
  </tr>
</table>

<pre>
    * inner loop of test: https://gcc.godbolt.org/z/r3G9EWPrb
```
for (k = 1; k < nx-1; k ++) {
    __builtin_prefetch(aa + POffset +  0, 0, 0);
    b[k+i*nx+j*nx*ny] = aa[ 0] * xx( 0, 0, 0)
                      + aa[ 1] * xx( 0,-1, 0)
                      + aa[ 2] * xx( 0, 1, 0)
                      + aa[ 3] * xx( 0, 0,-1)
                      + aa[ 4] * xx( 0, 0, 1)
                      + aa[ 5] * xx( 0, 1, 1)
                      + aa[ 6] * xx( 0, 1,-1)
                      + aa[ 7] * xx( 0,-1,-1)
                      + aa[ 8] * xx( 0,-1, 1)
                      + aa[ 9] * xx(-1, 0, 0)
                      + aa[10] * xx(-1,-1, 0)
                      + aa[11] * xx(-1, 1, 0)
                      + aa[12] * xx(-1, 0,-1)
                      + aa[13] * xx(-1, 0, 1)
                      + aa[14] * xx( 1, 0, 0)
                      + aa[15] * xx( 1,-1, 0)
                      + aa[16] * xx( 1, 1, 0)
                      + aa[17] * xx( 1, 0,-1)
                      + aa[18] * xx( 1, 0, 1);
    aa += 19;
}
```
* comparison of instruction counts for the above inner loop body
```
#1 = clang.s
#2 = gcc.s
||=                             Insn =||=      1 =||=      2 =||=   Diff =||
||                                add ||       12 ||        2 ||       10 ||
||                               b.ne ||        1 ||        0 ||        1 ||
||                                bgt ||        0 ||        1 ||       -1 ||
||                                cmp ||        1 ||        1 ||          ||
||                              fmadd ||       18 ||       18 ||          ||
||                               fmov ||        0 ||       10 ||      -10 ||         ---  it seems gcc uses fmov to reuse the loaded values
||                               fmul ||        1 ||        1 ||          ||
||                                ldp ||       11 ||        9 ||        2 ||
||                                ldr ||       16 ||       10 ||        6 ||
||                               prfm ||        1 ||        1 ||          ||
||                                str ||        1 ||        1 ||          ||
||              **Total**             ||       62 ||       54 ||        8 ||
```
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJysl0-PqzYQwD_NcBkR2Sb8O3Agm6bqqe9QqceVARN4D3Bkm2hfP31lkrAJgXZJa6Fdjxn_ZjwexjHXuj52QiTg78DfO7w3lVTJuSzK0slk8TMBlmLddUJhI-UJZYlGaANeipUxJw1eCuwA7HDM881RFplszEaqI7DDX8AOyvs1_uXPbyoDsgeSQkCuzyCWUiGw6AeCt0cK3g5t9w27D_cmsd3wxAjh7jIJr-39PevrxtTd-0mJUpi8AhZxbqfgt9_LUgsz9JEAexv_xODdOBn4ux_AdjWwtPsAtvt-7aTdT_D3g1Ocg79DMogsxY8PYNEUeKFZUxdt-qzt0on2YvvEsBmjL2C8Bd-tS8--b5dWSlcZ9Zd8X4cJFjDzvocLcXfXGY2Wtm8dJn7AjAkwnzOUPGu_kDOUzhh9AcMWfJ-NO_WWVroqYHSSerMB-wLGf8a8Eslgxpsp5lM7XPB9XerRaCkE9KFwXdul1g2lc_oSwv1swbVsUwnMZXviqta17GxFrzvdYde3mVBoa7JV4Zk8i_vKb0-DBahHh2KZN7w7bvQ4yoZRezDcxsI3-3j75WAg4m_WG_D2E206M8Yex_Z1WeLd0J3Rf7Rog1kU-KBJ2aOMo0wJruZnm05MeHQik4X3a1aRHc0SdWLOfQWft6elRUxXg6vxZfu8B9GIj_4zH8tWnhej8xh9l053A13XRawNaiFabZMaey0uTCNRCSvZ76aRvMAzb3qxyrW--Zf0GOX129YUk22jU3h8k9lLeDXBBxN5DGawHn9SZfvV0Ixt_SK0Uf9fagOzlfYPaXhz6T6-vZ8RjGXF307sRI92bkXXKRKviL2YOyKhIY0CGjKfOFVCaE4YFWWRRyyneRTRaBt6kRAxowWlwqkTRtiWBDQk8Zb4ZMNyvxDbwGNFzgtCBGyJaHndbJrm3Npf8U6tdS-S2A8D6jQ8E42-3RRUYpXcrD9q2JKm1kZ_TjO1aYY7RZqqvAqGo304HbDiGts-r7CV6vqx1J02qs-NPYxMxTv7cTm9apLJBaM2VZ9tctkCO1hL13_uScnvIjfADoO3Gtjh4vA5YX8HAAD__x5z1hU">