<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/144208>144208</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AArch64] Missed optimization: Make full use of smaddl instruction in AArch64
</td>
</tr>
<tr>
<th>Labels</th>
<td>
backend:AArch64,
llvm:optimizations,
missed-optimization
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
hstk30-hw
</td>
</tr>
</table>
<pre>
https://godbolt.org/z/x6z11hPj7
For the code below (from OpenSSL), GCC makes full use of the `smaddl` instruction, whereas LLVM generates `madd` instead (seen with LLVM 15), and the performance drops significantly.
```
#include &lt;stdint.h&gt;
/* Array of ten int32_t limbs. NOTE(review): `fe` presumably abbreviates
 * "field element" - confirm against the OpenSSL original. */
typedef int32_t fe[10];
/* Masks with only the low 21, 25 and 26 bits set, respectively.
 * NOTE(review): none of these three is referenced by the code shown in this
 * reproducer. */
static const int64_t kBottom21Bits = 0x1fffffLL;
static const int64_t kBottom25Bits = 0x1ffffffLL;
static const int64_t kBottom26Bits = 0x3ffffffLL;
/* Masks with the low 25 bits (kTop39Bits) or the low 26 bits (kTop38Bits)
 * cleared and all higher bits set; fe_mul uses them to isolate the part of a
 * limb that has been carried into the next limb. */
static const int64_t kTop39Bits = 0xfffffffffe000000LL;
static const int64_t kTop38Bits = 0xfffffffffc000000LL;
/*
 * fe_mul: multiplies the ten-limb values f and g and stores the ten-limb
 * result in h.
 *
 * h  output; all ten entries are overwritten.
 * f  input; only read.
 * g  input; only read.
 *
 * Every limb of f and g is copied into a local before any entry of h is
 * written, and f/g are not read again afterwards.
 *
 * Naming of the partial products: fIgJ is f[I] * g[J]. A "_2" suffix means
 * the doubled f limb was used, "_19" means the g limb pre-multiplied by 19
 * was used, and "_38" means both. In the statements below the factor 19
 * appears exactly when I + J is 10 or more (the product is then added to
 * limb I + J - 10), and the factor 2 appears exactly when I and J are both
 * odd.
 *
 * NOTE(review): the constant 19 and the alternating 26-bit / 25-bit limb
 * widths look like arithmetic modulo 2^255 - 19; the report only says the
 * code comes from OpenSSL - confirm.
 * NOTE(review): the pre-scaled values (19 * g, 2 * f) are computed in 32-bit
 * arithmetic, so this assumes the input limbs are small enough not to
 * overflow int32_t - TODO confirm the bounds callers guarantee.
 */
void fe_mul(fe h, const fe f, const fe g)
{
/* Copy the input limbs into locals. */
int32_t f0 = f[0];
int32_t f1 = f[1];
int32_t f2 = f[2];
int32_t f3 = f[3];
int32_t f4 = f[4];
int32_t f5 = f[5];
int32_t f6 = f[6];
int32_t f7 = f[7];
int32_t f8 = f[8];
int32_t f9 = f[9];
int32_t g0 = g[0];
int32_t g1 = g[1];
int32_t g2 = g[2];
int32_t g3 = g[3];
int32_t g4 = g[4];
int32_t g5 = g[5];
int32_t g6 = g[6];
int32_t g7 = g[7];
int32_t g8 = g[8];
int32_t g9 = g[9];
/* g limbs pre-multiplied by 19 (32-bit multiply). */
int32_t g1_19 = 19 * g1;
int32_t g2_19 = 19 * g2;
int32_t g3_19 = 19 * g3;
int32_t g4_19 = 19 * g4;
int32_t g5_19 = 19 * g5;
int32_t g6_19 = 19 * g6;
int32_t g7_19 = 19 * g7;
int32_t g8_19 = 19 * g8;
int32_t g9_19 = 19 * g9;
/* Odd-indexed f limbs doubled (32-bit multiply). */
int32_t f1_2 = 2 * f1;
int32_t f3_2 = 2 * f3;
int32_t f5_2 = 2 * f5;
int32_t f7_2 = 2 * f7;
int32_t f9_2 = 2 * f9;
/* The 100 partial products. Each has two int32_t operands, one of which is
 * cast to int64_t so the multiplication is carried out in 64 bits. This
 * signed 32x32->64 multiply feeding an addition is the pattern the report
 * expects to become smaddl. */
int64_t f0g0 = f0 * (int64_t) g0;
int64_t f0g1 = f0 * (int64_t) g1;
int64_t f0g2 = f0 * (int64_t) g2;
int64_t f0g3 = f0 * (int64_t) g3;
int64_t f0g4 = f0 * (int64_t) g4;
int64_t f0g5 = f0 * (int64_t) g5;
int64_t f0g6 = f0 * (int64_t) g6;
int64_t f0g7 = f0 * (int64_t) g7;
int64_t f0g8 = f0 * (int64_t) g8;
int64_t f0g9 = f0 * (int64_t) g9;
int64_t f1g0 = f1 * (int64_t) g0;
int64_t f1g1_2 = f1_2 * (int64_t) g1;
int64_t f1g2 = f1 * (int64_t) g2;
int64_t f1g3_2 = f1_2 * (int64_t) g3;
int64_t f1g4 = f1 * (int64_t) g4;
int64_t f1g5_2 = f1_2 * (int64_t) g5;
int64_t f1g6 = f1 * (int64_t) g6;
int64_t f1g7_2 = f1_2 * (int64_t) g7;
int64_t f1g8 = f1 * (int64_t) g8;
int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
int64_t f2g0 = f2 * (int64_t) g0;
int64_t f2g1 = f2 * (int64_t) g1;
int64_t f2g2 = f2 * (int64_t) g2;
int64_t f2g3 = f2 * (int64_t) g3;
int64_t f2g4 = f2 * (int64_t) g4;
int64_t f2g5 = f2 * (int64_t) g5;
int64_t f2g6 = f2 * (int64_t) g6;
int64_t f2g7 = f2 * (int64_t) g7;
int64_t f2g8_19 = f2 * (int64_t) g8_19;
int64_t f2g9_19 = f2 * (int64_t) g9_19;
int64_t f3g0 = f3 * (int64_t) g0;
int64_t f3g1_2 = f3_2 * (int64_t) g1;
int64_t f3g2 = f3 * (int64_t) g2;
int64_t f3g3_2 = f3_2 * (int64_t) g3;
int64_t f3g4 = f3 * (int64_t) g4;
int64_t f3g5_2 = f3_2 * (int64_t) g5;
int64_t f3g6 = f3 * (int64_t) g6;
int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
int64_t f3g8_19 = f3 * (int64_t) g8_19;
int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
int64_t f4g0 = f4 * (int64_t) g0;
int64_t f4g1 = f4 * (int64_t) g1;
int64_t f4g2 = f4 * (int64_t) g2;
int64_t f4g3 = f4 * (int64_t) g3;
int64_t f4g4 = f4 * (int64_t) g4;
int64_t f4g5 = f4 * (int64_t) g5;
int64_t f4g6_19 = f4 * (int64_t) g6_19;
int64_t f4g7_19 = f4 * (int64_t) g7_19;
int64_t f4g8_19 = f4 * (int64_t) g8_19;
int64_t f4g9_19 = f4 * (int64_t) g9_19;
int64_t f5g0 = f5 * (int64_t) g0;
int64_t f5g1_2 = f5_2 * (int64_t) g1;
int64_t f5g2 = f5 * (int64_t) g2;
int64_t f5g3_2 = f5_2 * (int64_t) g3;
int64_t f5g4 = f5 * (int64_t) g4;
int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
int64_t f5g6_19 = f5 * (int64_t) g6_19;
int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
int64_t f5g8_19 = f5 * (int64_t) g8_19;
int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
int64_t f6g0 = f6 * (int64_t) g0;
int64_t f6g1 = f6 * (int64_t) g1;
int64_t f6g2 = f6 * (int64_t) g2;
int64_t f6g3 = f6 * (int64_t) g3;
int64_t f6g4_19 = f6 * (int64_t) g4_19;
int64_t f6g5_19 = f6 * (int64_t) g5_19;
int64_t f6g6_19 = f6 * (int64_t) g6_19;
int64_t f6g7_19 = f6 * (int64_t) g7_19;
int64_t f6g8_19 = f6 * (int64_t) g8_19;
int64_t f6g9_19 = f6 * (int64_t) g9_19;
int64_t f7g0 = f7 * (int64_t) g0;
int64_t f7g1_2 = f7_2 * (int64_t) g1;
int64_t f7g2 = f7 * (int64_t) g2;
int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
int64_t f7g4_19 = f7 * (int64_t) g4_19;
int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
int64_t f7g6_19 = f7 * (int64_t) g6_19;
int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
int64_t f7g8_19 = f7 * (int64_t) g8_19;
int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
int64_t f8g0 = f8 * (int64_t) g0;
int64_t f8g1 = f8 * (int64_t) g1;
int64_t f8g2_19 = f8 * (int64_t) g2_19;
int64_t f8g3_19 = f8 * (int64_t) g3_19;
int64_t f8g4_19 = f8 * (int64_t) g4_19;
int64_t f8g5_19 = f8 * (int64_t) g5_19;
int64_t f8g6_19 = f8 * (int64_t) g6_19;
int64_t f8g7_19 = f8 * (int64_t) g7_19;
int64_t f8g8_19 = f8 * (int64_t) g8_19;
int64_t f8g9_19 = f8 * (int64_t) g9_19;
int64_t f9g0 = f9 * (int64_t) g0;
int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
int64_t f9g2_19 = f9 * (int64_t) g2_19;
int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
int64_t f9g4_19 = f9 * (int64_t) g4_19;
int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
int64_t f9g6_19 = f9 * (int64_t) g6_19;
int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
int64_t f9g8_19 = f9 * (int64_t) g8_19;
int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
/* Each output limb hK is the 64-bit sum of the ten partial products fIgJ
 * for which I + J equals K or K + 10. */
int64_t h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
int64_t h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
int64_t h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
int64_t h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
int64_t h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
int64_t h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
int64_t h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
int64_t h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19;
int64_t h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38;
int64_t h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0 ;
/* Scratch values for the carry steps below; each is assigned before use. */
int64_t carry0;
int64_t carry1;
int64_t carry2;
int64_t carry3;
int64_t carry4;
int64_t carry5;
int64_t carry6;
int64_t carry7;
int64_t carry8;
int64_t carry9;
/*
 * Carry propagation. Every step has the same three parts:
 *   carryK = hK + bias;            bias is 1 shifted left by 25 for even K
 *                                  and by 24 for odd K
 *   h(K+1) += carryK >> width;     width is 26 for even K, 25 for odd K
 *   hK -= carryK & mask;           mask clears the low `width` bits, so the
 *                                  amount just carried is removed from hK
 * The carry out of h9 is multiplied by 19 and added to h0. The statement
 * order matters: h4 and h0 are processed a second time after they have
 * received a carry from h3 and h9 respectively.
 * Right-shifting a negative int64_t is implementation-defined in C; the
 * code relies on the usual arithmetic (sign-propagating) shift.
 */
carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;
carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;
carry1 = h1 + (1 << 24); h2 += carry1 >> 25; h1 -= carry1 & kTop39Bits;
carry5 = h5 + (1 << 24); h6 += carry5 >> 25; h5 -= carry5 & kTop39Bits;
carry2 = h2 + (1 << 25); h3 += carry2 >> 26; h2 -= carry2 & kTop38Bits;
carry6 = h6 + (1 << 25); h7 += carry6 >> 26; h6 -= carry6 & kTop38Bits;
carry3 = h3 + (1 << 24); h4 += carry3 >> 25; h3 -= carry3 & kTop39Bits;
carry7 = h7 + (1 << 24); h8 += carry7 >> 25; h7 -= carry7 & kTop39Bits;
carry4 = h4 + (1 << 25); h5 += carry4 >> 26; h4 -= carry4 & kTop38Bits;
carry8 = h8 + (1 << 25); h9 += carry8 >> 26; h8 -= carry8 & kTop38Bits;
carry9 = h9 + (1 << 24); h0 += (carry9 >> 25) * 19; h9 -= carry9 & kTop39Bits;
carry0 = h0 + (1 << 25); h1 += carry0 >> 26; h0 -= carry0 & kTop38Bits;
/* Store the limbs, narrowing each 64-bit value to int32_t.
 * NOTE(review): presumably every limb fits in 32 bits after the carry
 * steps - confirm against the input bounds. */
h[0] = (int32_t)h0;
h[1] = (int32_t)h1;
h[2] = (int32_t)h2;
h[3] = (int32_t)h3;
h[4] = (int32_t)h4;
h[5] = (int32_t)h5;
h[6] = (int32_t)h6;
h[7] = (int32_t)h7;
h[8] = (int32_t)h8;
h[9] = (int32_t)h9;
}
```
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJy8mk9vo0oWxT9NeVNKRP2vWnhhJ53ZdGtGeqPZRg7m32vbWAb3e92ffgTlCwV9i7B6UUtx43P8u9w6XDDh0DRVccmyLVF7ol43h3tb1rdt2bTfRfJU_rX5qI8_t2XbXhsidoS_Ef5W1MeP-tQ-17eC8LdfhL_9rX8xVv7nT0OSHUl2--xU_0XT-pgRbvNbfab_vmaXP_74Srij1YX-6-WFpocLPR--ZzS_n0703mS0zinRSXM-HI8nohNaXZr2dk_bqr4Q_tL5vn793zdatbTILtnt0GadvpN7NT2dfpwpU534cDnStszoNbvl9e18uKQZrRp6vNXXa3ak3T5XeZUeLu3p57Mvmujk8S_ZES6qS3q6HzMiXpr2WF3a55KIL17Z_rxmxyyn1aUV_L2leUbUniVEvRKxJ8muaQ9tldK0vjRtJ9LyvaXf93Xb1mfO9lXbUCJeafI3y7ufr19X2NRvtnU-HfjEGt9_66twgSmHnyzpfz53W8ydTt30R10daZ69n--nLiMZLbtl85-WZzSf_K8g3HWtN72V0rHzSc_JidqP7Z8I2CBguIAPAo4LxCAQuEAOAokL1CBQuEAPAo0LzCAwuMAOAosL3CBwqKDwnSyinSzYIMA7WfBBgHeyEIMA72QhBwHeyUINAryThR4EeCcLMwjwThZ2EOCdLNwgiHSSvTOv6X7xHS0Y1qy5iGMNm4vEQzR2bK6QWNfmIoV1bi7SWPfmIoN1cC6y86rdXOGwI_fdB4r3khxpYi6mEoEde1MJsue5mUqQXcrdVDItt598efI4grqRxHeUcPt4qzvnFQnqYAsOhjr4goOjDrHgEKhDLjgk6lALDoU69IJDow6z4DCowy44LOpwCw5szRmsOVu75qyAWPt8r1t3BuuOcrB1Z4X4hIOtPYO1RzlyPIxBrj6BYIvPYPFRCLb4rDCfcLAAMAgAysECwAr3LuwiqJtciJNDEFATFgQOB__qEHAIAerAQsDh4F8dAA4BQB3Ywc_h4F-9_hzWH3Vg68_h4F-99nw8CeEmG1tHt-yLrL-A9Rdr11-Mg0CsHwQCMoBysAyIcRBEOFgOBOQA5WA5EOMsiHCwLAjIAsrR84EjCjMcoBGKiS3QGAiUFQmECEZChBiJhIRIyLWRkDASUAcWBwlxQB1YHCSMBNSBRUFCFFAHFgUJIwF1YDGQ49UnbtKxHptlXyQNMkgD6oukQQbjAfVFsqAgC2ptFtQ4HtT68aAgDygHy4Max0OEg2VCQSZQzm_XCapQw0EUoahY48ZgoKxIMFQwKCLESDRUEA2UGImGCgZFhBgJh4Zw6LXh0DAoUAcWDA3BQB1YMDQMCtSBhUKPX0Zxk4ztv1r2RbKhg2ygvkg2dDA0UF8kGTpIBuqLJEMHQwP1RXJhIBdmbS7MODTM8tAILHwBggXDFGIIeoQiYjs0BgTFRQJigvERIUYiYoKIoMQgIoHJfIaLJMQECUFxkYSYYHZEiJGMWMiIXZsRC7MDdWCzw473qnATj9Umln2RmNggJqgvEhMbzBHUFwmJDUKC-iJzxAZzBPVFUmKDlKC-SEpsMEdQXyQjDjLi1mbEFWzIo4sMkhhrzAqKi2TFBUMlQoykxQVpQYmRtLhgqESIkby4IC8oERsqLhgqEVwkLi6IC4qLxMUFQyVCxANTwn3MLjd8P94W6V7DF-vuNXwb617DFXz3Gi73utdwUdC9hjNH9xqGSvfaxw0pBG6PdiPLF5JAFS6oYqwIvg_4KsaK4BLDVzFWBCPKV8F_X7cS7rd2p0lfgr8xtH_c8_EVuKAPNqjABH3QQQUq6IMMKhB4H-AWbndJ5osYSmBQQgJ8F_Bt0AET8HXQARXw8WOlhBvC3VW_54uxCRwqGBojoRgVNEYHxZigMTYoxh-TSAFwf7n7YukLkIAXgB_gDOAJkF1A
tsGum4CMD_kS7lMXGshq3HUJbDHuOgf60A4NhZigHTYoxM8GBA63vAsDcA1oBWgJYAHgAcsAmwDTBUx8bJRw07ywwDTjDmugqnGHJXDFuMMcyEMTLBThxxIChnvvhQPwMHIMYDVAFUAlIAUgByADYDfP5rj0cLv9RM59_XbkKqjfjlwO99uRb0P99vkX4X4jcv-j347c-Oy3I7c3--1ID_vtwaL6nez7Wvr2E267ofpCxAvlivBO3M9Zvu9Ug-ELEV8o1_27CX0K3-M6-Cv_jOUHRSkXWGrCkjOWDFhymeVPDr72KUsCi09YbGCpx14_he8ByyEsP3987RGWnrDUjKUClkJZIc6fdHz5kTaKCY7P2sgDHF9uox9wvvwIy0xYesbSAUujrBDnT2W-_Egn5QQnZp0UAU4sr5ofn778CMtOWGbGMgHLLLP-yeT7Ee1rj7DchGVnLBuw7DLLT2X_eZEeJsAi3A4eaCP3l6wMyhrBbrmh_9DYKh_Pu9DHDjz-4k-4K-HsUD6eeEEkbPpBHFfxqUrgKhHgJC6R0w9SuEpNVRpX6QBncImZfpDFVXaqcrjqcVYi5jV80m5z3IqjE-6wybbMSGetNInblFujbPaRW20ES1wuMpsdtE5VenS5tNyJTbXlCVeJZjIxiUrcs8o_XPqhVCrz9MhzTWSSnQ_V6fl0-nF-rm_Fpmqae7ZlUvLEbk6Hj-zU9A89cv5xSL9nlyMRu93ulpZaEs4JfyGcd2YidvW1rc7Vr0Nb1ZdmePNcNU12fArf7N5Tr5vbtjM-fdyLhsjkVDVtM9bRVu2pf9wSYOqVfus_ik4-Suzot_nzkf7hyPDJSFpd6OODNvfbaf6kZtWW94_ntD4T_tbvjP_1dL3Vf2ZpS_hb35aG8LdHZ35s-f8DAAD__5_A0pQ">