<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=http://email.email.llvm.org/c/eJztW11z4jgW_TXkRQXlK_OVBx7SSXonNTOdqd6u6n2jhC1AO7ZFWzLQ-fV7JRmwAWPAXfsyTojBsnTu0dGVrm-wZjL8OXlLiF5yMpdRJDciWZAOpSoRqxXX-IlsliJYEqGISDRPQh4SLUkg41WmOcESJVM8TdYyyrSQiSJyTpT44AS2v22_d-jTGAGfyHjr4y8MSaaMDWMxYWLNO95Lx3uKuV7KEDmkhJFZJIO_DQzd0m2fMI1lWsS8Q59Nu5QTxXmsDI8ZfpYxJylfpFwptI9FesN5QoKIJYsuQM8jLEHSaZb83XPWPmdRhJxDTtiaiYjNIm6sLLVeqY7_1KGf8bWQ4UxGuifTBZ594B9bf_jL7e9_jhyKO34Xeml7Yw3YTyJJeEoiKVdkyRRZpHKTkI2px2Yy02TgYRWF9QMnmFqJKNqLwrEP2B2hNE8VmTGUwvBHaRCg2yV8jZ3bLPEgk-hnDkn7hTYMFcoUD3so_PgvibKYDm7YT6MYW0sRYuezKDTqYQlWJfM4YlMUjP-Yzn1qhjpFiiIwkmMXErkhQu9kX6JWKY_l2nDuzudM6W7MtHUSFim0QUKhmCFjR7Q4Esp1MotneBGH2DhVGvNQMM0LXTC6rVI5w6H56QTsdehjUfdvCCONNzgIrrRhI5TKuOGRkwz5XCTWp4fe96_vX_41fX5_eZ3-8fr08oYn396nn1-_v36d_vXt67-Jh5V2ghiWLNAZi7BLC5ni6MVGjLISksTcmYolio5yWzIJi5AtWbNUGN9yI7Lg6BXYy_L8QTMraRspqxVSUGnwH2PKvCPkpy5BXxEhx0s98p7gSNrB49sVD_SuowuWhmjJzr4lciwAo3XDnmsR2GkcryIec7wSugklVYHDZucYDFeDwNKdpzLGIVV8xQx_rCsjY8Z1KCKrLF0ZjMPooZfO0CmNVoWxV5ql2opmHZ6wBRNu6fny-v7l0LxXHGbsdP5yp9QXSRBlOHU7_nOgdIi8O_7r6VVKWRpPzWzqLfGkhEl96xec1LoEFNvNI8m0j0vStj_VOL9ZOEXPxEG3F8wqh0NmNEVOQ1PFjdt0G58rXBkX6Yw-OWyCPyxDbdAF_BeyjkKwUxHRiwbMJLAlGcKNp9oVoVnjKEV7j_aFTvQL4FY5nH_M9ccOnO7ArQCmJslBTIXxvqUZoHm96tjItIMjc6Hn7PGt3tn70VvjXBt88jqDF9Pd3Tnk534uQwi2aan6SXdC_2Z82OHTEj4U8bHTPFK8vjPp_6MH6hbaSSjmBxMp11makNI0QAcOPYQKwRyoOfidUQEFPxennj3mIShZH80dA0SKBXBcQI8LUJ7nA0P3s78eKl2qj-dlgEDy-VzlsxNDdKiXJ7MSeSptr78_k7I7ly-_ucswPHR-X-tk8udI9OntGQ_9EuRx5cRW7h6mY7FyKMt0cwAiA9vq_blUu6rF3oM-PmyzXJjjpnt1zfAjQ-l5ZszwDdwbdW_-FS3BtQTXElxLuKIlOJvgbIKzCdfYBGcTnE1wNqHCphFROBHfTkW8JORezMA7WX0_PvL10Srt1sf-WfQ9Ssg0sziFiLObLIUl-nCSHJZgz5kYX7YQQDXPa2gaN9jVQpNusbF3_YZ71Yp1hAAlBLgDgZYQ6B0IfgnBvx0B6nWAGoR6HeoQ6nWoQ6jX4SKCcyr6C50K7hOz4FRwn5gFp4L7xCw4FdwjZsmpqnSgVztVlQ51CPU61CHU61CDUPAHep8OBX-g9-lQ8Ad6nw4Ff6D36QD1OvhX-0OVDnUI9TrUIdTrcBGhIizaW8aLcdFVuhwWzU1Gw4gGjSMaNI5o0DiiQeOIBo0jGjSOaNA0ol24k7sqlkHjWAaNYxk0jmXQOJZB41gGjWMZNI5l0DSWXbjhvtmd7guF0DgUQuNQCI1DITQOhdA4FELjUAi3hsIKp6q-4T7f4JyrtQlem-Cd-V
9gm6G1GVqbobUZ2h1hqSJPo3X_vnR1ymlam6y1yVqjZO2We542Zzv1qjZnK3uVgURPuvCVTpvBtRlcowyu5GLtt3FtstYma22ydnuy1qZsbcrWMGXzr3nixG9ztjZnuyfLaLOtE3_4B2dbbYJ0xh_-QQlSsXVn9GJ2IUV2g0W3K4KqL7UOTyZT-4hpZ_Qpf0LVPW86Oo-_Vnr_AHv-yDC2rbCBFXY9Of_4yBEJcCQgJwG3kIBfRYI6EjQnQW8hQX8VCd-R8HMS_i0k_LtJVD6he85ZIHcWqHaWMwzpZY_JadLLPM9QAkcJckqVrlNJqdJ_7qZEHaUcvtqRKilVetPdlHxHyc8pVbpVJaVK37qO0n6HQ6GguFzJk-XqqMJu28Hhjnm_Z2K3-ekhnPjho__IHlimlzKdZEkQcabihyyNJkf7FoVeZrNeIGM8iaL17q27SuV_eYDz6bPdJafww8AfP44flhM-GsKQ8dHcH4Sj_mDUD0b9_tjj43DOBzD2HiI245Ga4CrdoTThG7fRzuyrGrw8NGcgJtSj-IKRNx4MvHHP4zAaDGHmD4PhiM6Hnb7HYyainsExWzIf0omFnGULhRcjobQ6XGRKiUXCuSWMDLXQEZ98PWwSFYc9lrudbqbYbXVL42HfbVwj5e2jD5b2xHL-H4qZKXA>53898</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            Regression in register allocation between arm64 clang 11.0 and trunk
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          uncleasm
      </td>
    </tr>
</table>

<pre>
    In the following "snippet" which is intended to compute tensor convolutions of size 1xHxW*8 * 8x3x3x16 using the naive
method for a block of 2x2x4 at a time, there seems to be some regression between clang-11.0 and trunk.
Full code available at https://godbolt.org/z/avz3hxKM7

With trunk, the inner loop has grown by about 50 instructions that spill the NEON registers back and forth -- even though only about 24 registers are used. (A possible way to avoid this could be to use the vfmaq_laneq_f32 (FMLA by lane) intrinsic, since removing -ffast-math now also appears to be a disaster for clang-11.0, probably because the number of intermediate registers has grown.)

The other interesting issue is that when `WRONG_CODE_LEADING_TO_FEWER_PTRS` is redefined to `0` to use the actual algorithm, it seems to me that more (internal) variables are generated to compute the pointers for `srcX, srcX +- stride`. One would expect that, regardless of how the pointer arithmetic is implemented, those pointers would be allocated from a separate pool of general-purpose registers -- but now clang-11.0 also starts to spill the NEON registers again.

```
#include <cstdint>
#include "arm_neon.h"

// Selects the load_row variant that gets compiled:
//   1     -> d0/d3 are built only from the vld2q_f32 result (the neighbouring
//            row loads r and s are then unused);
//   other -> d0/d3 also take a lane from the neighbouring rows r and s.
#define WRONG_CODE_LEADING_TO_FEWER_PTRS 1

// Loads 8 consecutive floats at `src` plus one 4-float vector from each of two
// neighbouring rows, and returns four derived vectors {d0, d1, d2, d3}.
//
//   src        pointer to the current row; 8 floats are read from it.
//   stride_xm  BYTE offset (applied through a uint8_t* cast) of the row read into `r`.
//   stride_xp  BYTE offset (applied through a uint8_t* cast) of the row read into `s`.
//
// NOTE(review): "xm"/"xp" presumably mean "x minus"/"x plus" -- confirm against
// the caller, which passes the positive stride first.
float32x4x4_t load_row(float *src, int64_t stride_xm, int64_t stride_xp) {
    // r and s are only consumed by the #else variant below.
    auto r = vld1q_f32((float *)((uint8_t*)src + stride_xm)), s = vld1q_f32((float *)((uint8_t*)src + stride_xp));
    // vld2q_f32 de-interleaves: val[0] gets the even-indexed floats, val[1] the odd-indexed ones.
    // `src` is a by-value parameter, so the increment is not visible to the caller.
    auto q = vld2q_f32(src); src += 8;
#if WRONG_CODE_LEADING_TO_FEWER_PTRS == 1
    // Variant that avoids the neighbouring rows entirely.
    // vextq_f32(a, b, n) yields lanes n..3 of a followed by lanes 0..n-1 of b.
    auto d0 = vextq_f32(q.val[0], q.val[1], 3), d1 = q.val[0];
    auto d3 = vextq_f32(q.val[0], q.val[1], 1), d2 = q.val[1];
#else
    // d0 = { r[3], odd[0], odd[1], odd[2] };  d3 = { even[1], even[2], even[3], s[0] }.
    auto d0 = vextq_f32(r, q.val[1], 3), d1 = q.val[0];
    auto d3 = vextq_f32(q.val[0], s, 1), d2 = q.val[1];
#endif
    return float32x4x4_t{d0,d1,d2,d3};
}


// Reproducer kernel for the register-allocation report: a fully unrolled
// multiply-accumulate loop over four input rows (src0..src3) that keeps 16
// vector accumulators live and writes them to two output streams.
//
//   src0..src3  input row pointers; each is advanced by 8 floats per inner (ic) iteration.
//   dst         receives the o00x/o01x accumulators (4 interleaved stores per oc iteration).
//   dst2        receives the o10x/o11x accumulators (4 interleaved stores per oc iteration).
//   coeffs      coefficient stream; re-read from the start for every oc iteration.
//   width       outer iteration count (do/while with pre-decrement, so the body
//               always runs at least once).
//
// The statement order is deliberate for the codegen comparison; do not reorder.
//
// NOTE(review): src0..src3, dst and dst2 are never rewound between oc/width
// iterations -- confirm this is intended for anything beyond a codegen test.
void conv(float *src0, float *src1, float *src2, float *src3,
          float *dst, float *dst2, float *coeffs, int width) {
    const int OC = 8;
    const int IC = 16;

    // Row stride in bytes (load_row applies it through a uint8_t* cast).
    int64_t stride_xp = 8*IC*4;
    int64_t stride_xn = -stride_xp;
    do {
        int oc = OC;
        do {
            auto zz = coeffs;
            // NOTE(review): the accumulators are declared without an initializer and
            // are first touched with `+=`, i.e. they are read before being written.
            // Presumably tolerated because only the generated code matters here.
            float32x4_t o000,o001,o002,o003;
            float32x4_t o010,o011,o012,o013;
            float32x4_t o100,o101,o102,o103;
            float32x4_t o110,o111,o112,o113;
            int ic = IC;
            do {
                // c0/c1/c2 each hold 4 coefficients; `cN[k] * vec` uses the compiler's
                // vector-extension syntax (scalar lane times a whole vector).
                auto c0 = vld1q_f32(zz);  zz += 4;
                // NOTE(review): the positive stride is passed as load_row's `stride_xm`
                // and the negative one as `stride_xp` -- naming looks swapped, confirm.
                auto data = load_row(src0, stride_xp, stride_xn); src0+=8;
                auto c1 = vld1q_f32(zz); zz += 4;
                o000 += c0[0] * data.val[0];
                o001 += c0[1] * data.val[0];
                o002 += c0[2] * data.val[0];
                o003 += c0[3] * data.val[0];
                o010 += c0[0] * data.val[1];
                o011 += c0[1] * data.val[1];
                o012 += c0[2] * data.val[1];
                o013 += c0[3] * data.val[1];
                auto c2 = vld1q_f32(zz); zz += 4;
                o000 += c1[0] * data.val[1];
                o001 += c1[1] * data.val[1];
                o002 += c1[2] * data.val[1];
                o003 += c1[3] * data.val[1];
                o010 += c1[0] * data.val[2];
                o011 += c1[1] * data.val[2];
                o012 += c1[2] * data.val[2];
                o013 += c1[3] * data.val[2];
                o000 += c2[0] * data.val[2];
                o001 += c2[1] * data.val[2];
                o002 += c2[2] * data.val[2];
                o003 += c2[3] * data.val[2];
                o010 += c2[0] * data.val[3];
                o011 += c2[1] * data.val[3];
                o012 += c2[2] * data.val[3];
                o013 += c2[3] * data.val[3];
                // Row from src1: feeds the o1xx accumulators with the current c0..c2,
                // each coefficient vector being reloaded right after its last use.
                data = load_row(src1,  stride_xp, stride_xn);src1+=8;
                o100 += c0[0] * data.val[0];
                o101 += c0[1] * data.val[0];
                o102 += c0[2] * data.val[0];
                o103 += c0[3] * data.val[0];
                o110 += c0[0] * data.val[1];
                o111 += c0[1] * data.val[1];
                o112 += c0[2] * data.val[1];
                o113 += c0[3] * data.val[1];
                c0 = vld1q_f32(zz);zz += 4;
                o100 += c1[0] * data.val[1];
                o101 += c1[1] * data.val[1];
                o102 += c1[2] * data.val[1];
                o103 += c1[3] * data.val[1];
                o110 += c1[0] * data.val[2];
                o111 += c1[1] * data.val[2];
                o112 += c1[2] * data.val[2];
                o113 += c1[3] * data.val[2];
                c1 = vld1q_f32(zz);zz += 4;
                o100 += c2[0] * data.val[2];
                o101 += c2[1] * data.val[2];
                o102 += c2[2] * data.val[2];
                o103 += c2[3] * data.val[2];
                o110 += c2[0] * data.val[3];
                o111 += c2[1] * data.val[3];
                o112 += c2[2] * data.val[3];
                o113 += c2[3] * data.val[3];

                c2 = vld1q_f32(zz);
                zz += 4;

                // The same src1 row now feeds the o0xx accumulators with the
                // freshly reloaded coefficient vectors.
                o000 += c0[0] * data.val[0];
                o001 += c0[1] * data.val[0];
                o002 += c0[2] * data.val[0];
                o003 += c0[3] * data.val[0];
                o010 += c0[0] * data.val[1];
                o011 += c0[1] * data.val[1];
                o012 += c0[2] * data.val[1];
                o013 += c0[3] * data.val[1];

                o000 += c1[0] * data.val[1];
                o001 += c1[1] * data.val[1];
                o002 += c1[2] * data.val[1];
                o003 += c1[3] * data.val[1];
                o010 += c1[0] * data.val[2];
                o011 += c1[1] * data.val[2];
                o012 += c1[2] * data.val[2];
                o013 += c1[3] * data.val[2];
                o000 += c2[0] * data.val[2];
                o001 += c2[1] * data.val[2];
                o002 += c2[2] * data.val[2];
                o003 += c2[3] * data.val[2];
                o010 += c2[0] * data.val[3];
                o011 += c2[1] * data.val[3];
                o012 += c2[2] * data.val[3];
                o013 += c2[3] * data.val[3];

                // Row from src2: o1xx first, then (after reloading c0..c2) o0xx.
                data = load_row(src2, stride_xp, stride_xn);src2+=8;

                o100 += c0[0] * data.val[0];
                o101 += c0[1] * data.val[0];
                o102 += c0[2] * data.val[0];
                o103 += c0[3] * data.val[0];
                o110 += c0[0] * data.val[1];
                o111 += c0[1] * data.val[1];
                o112 += c0[2] * data.val[1];
                o113 += c0[3] * data.val[1];
                c0 = vld1q_f32(zz); zz += 4;

                o100 += c1[0] * data.val[1];
                o101 += c1[1] * data.val[1];
                o102 += c1[2] * data.val[1];
                o103 += c1[3] * data.val[1];
                o110 += c1[0] * data.val[2];
                o111 += c1[1] * data.val[2];
                o112 += c1[2] * data.val[2];
                o113 += c1[3] * data.val[2];
                // NOTE(review): this load (and the c2 load below) reads from zz + 4 and
                // then advances zz by 4, unlike every other coefficient load, which reads
                // from zz. Looks like an off-by-one-vector inconsistency -- confirm.
                c1 = vld1q_f32(zz + 4);  zz += 4;

                o100 += c2[0] * data.val[2];
                o101 += c2[1] * data.val[2];
                o102 += c2[2] * data.val[2];
                o103 += c2[3] * data.val[2];
                o110 += c2[0] * data.val[3];
                o111 += c2[1] * data.val[3];
                o112 += c2[2] * data.val[3];
                o113 += c2[3] * data.val[3];

                c2 = vld1q_f32(zz + 4); zz += 4;
                o000 += c0[0] * data.val[0];
                o001 += c0[1] * data.val[0];
                o002 += c0[2] * data.val[0];
                o003 += c0[3] * data.val[0];
                o010 += c0[0] * data.val[1];
                o011 += c0[1] * data.val[1];
                o012 += c0[2] * data.val[1];
                o013 += c0[3] * data.val[1];

                o000 += c1[0] * data.val[1];
                o001 += c1[1] * data.val[1];
                o002 += c1[2] * data.val[1];
                o003 += c1[3] * data.val[1];
                o010 += c1[0] * data.val[2];
                o011 += c1[1] * data.val[2];
                o012 += c1[2] * data.val[2];
                o013 += c1[3] * data.val[2];

                o000 += c2[0] * data.val[2];
                o001 += c2[1] * data.val[2];
                o002 += c2[2] * data.val[2];
                o003 += c2[3] * data.val[2];
                o010 += c2[0] * data.val[3];
                o011 += c2[1] * data.val[3];
                o012 += c2[2] * data.val[3];
                o013 += c2[3] * data.val[3];

                // Row from src3: only the o1xx accumulators are updated, and no
                // further coefficients are loaded in this iteration.
                data = load_row(src3, stride_xp, stride_xn); src3+=8;

                o100 += c0[0] * data.val[0];
                o101 += c0[1] * data.val[0];
                o102 += c0[2] * data.val[0];
                o103 += c0[3] * data.val[0];
                o110 += c0[0] * data.val[1];
                o111 += c0[1] * data.val[1];
                o112 += c0[2] * data.val[1];
                o113 += c0[3] * data.val[1];
                o100 += c1[0] * data.val[1];
                o101 += c1[1] * data.val[1];
                o102 += c1[2] * data.val[1];
                o103 += c1[3] * data.val[1];
                o110 += c1[0] * data.val[2];
                o111 += c1[1] * data.val[2];
                o112 += c1[2] * data.val[2];
                o113 += c1[3] * data.val[2];
                o100 += c2[0] * data.val[2];
                o101 += c2[1] * data.val[2];
                o102 += c2[2] * data.val[2];
                o103 += c2[3] * data.val[2];
                o110 += c2[0] * data.val[3];
                o111 += c2[1] * data.val[3];
                o112 += c2[2] * data.val[3];
                o113 += c2[3] * data.val[3];
            } while (--ic);
            // vst2q_f32 interleaves the two vectors of each pair and writes 8 floats.
            float32x4x2_t o0{o000, o010};
            vst2q_f32(dst, o0);
            dst += 8;
            float32x4x2_t o1{o001, o011};
            vst2q_f32(dst, o1);
            dst += 8;
            float32x4x2_t o2{o002, o012};
            vst2q_f32(dst, o2);
            dst += 8;
            float32x4x2_t o3{o003, o013};
            vst2q_f32(dst, o3);
            dst += 8;
            {
                // Inner scope: o0..o3 shadow the pairs above and go to dst2.
                float32x4x2_t o0{o100, o110};
                vst2q_f32(dst2, o0);
                dst2 += 8;
                float32x4x2_t o1{o101, o111};
                vst2q_f32(dst2, o1);
                dst2 += 8;
                float32x4x2_t o2{o102, o112};
                vst2q_f32(dst2, o2);
                dst2 += 8;
                float32x4x2_t o3{o103, o113};
                vst2q_f32(dst2, o3);
                dst2 += 8;
            }
        } while (--oc);
    } while (--width);
}
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJztW11z4jgW_TXkRQXlK_OVBx7SSXonNTOdqd6u6n2jhC3AO7JFWzLQ-fV7JRmwAWPAXfsyTojBsnTu0dGVrm-wZjL8OXlLiF5yMpdCyE2ULEiHUpVEqxXX-IlsllGwJJEiUaJ5EvKQaEkCGa8yzQmWKJniabKWItORTBSRc6KiD05g-9v2e4c-jRHwiYy3Pv7CkGTK2DAWExatecd76XhPMddLGSKHlDAyEzL428DQLd32CdNYpqOYd-izaZdyojiPleExw88y5iTli5QrhfaxSG84T0ggWLLoAvQ8whIknWbJ3z1n7XMmBHIOOWFrFgk2E9xYWWq9Uh3_qUM_42shw5kUuifTBZ594B9bf_jL7e9_jhyKO36P9NL2xhqwn6Ik4SkRUq7IkimySOUmIRtTj81kpsnAwyoK6wdOMLWKhNiLwrEP2J1IaZ4qMmMoheGP0iBAt0v4Gju3WeJBJuJnDkn7hTYMFcoUD3so_PgvibKYDm7YT6MYW8soxM5nIjTqYQlWJfNYsCkKxn9M5z41Q50ixSgwkmMXErkhkd7JvkStUh7LteHcnc-Z0t2YaeskTCi0QcJIMUPGjmhxJJTrZBbP8CIOsXGqNOZhxDQvdMHotkrlDIfmpxOw16GPRd2_IYw03uAguNKGTaRUxg2PnGTI51FifXroff_6_uVf0-f3l9fpH69PL2948u19-vn1--vX6V_fvv6beFhpJ4hhyQKdMYFdWsgURy82YpSVkCTmzlQsUXSU25JJmEC2ZM3SyPiWG5EFR6_AXpbnD5pZSdtIWa2QgkqD_xhT5h0hP3UJ-koUcrzUI-8JjqQdPL5d8UDvOrpgaYiW7OxbIscCMFo37LmOAjuN45XgMccroZtQUhU4bHaOwXA1CCzdeSpjHFLFV8zwx7pSGDOuQ4KssnRlMA6jh146Q6c0WhXGXmmWaiuadXjCFixyS8-X1_cvh-a94jBjp_OXO6V-lAQiw6nb8Z8DpUPk3fFfT69SytJ4amZTb4knJUzqW7_gpNYloNhuLiTTPi5J2_5U4_xm4RQ9EwfdXjCrHA6Z0RQ5DU0VN27TbXyucGVcpDP65LAJ_rAMtUEX8F_IWoRgpyKiFw2YSWBLMoQbT7UrQrPGUYr2Hu0LnegXwK1yOP-Y648dON2BWwFMTZKDmArjfUszQPN61bGRaQdH5kLP2eNbvbP3o7fGuTb45HUGL6a7u3PIz_1chhBs01L1k-6E_s34sMOnJXwo4mOnuVC8vjPp_6MH6hbaSRjNDyZSrrM0IaVpgA4ceggVgjlQc_A7owIKfi5OPXvMQ1CyPpo7BogUC-C4gB4XoDzPB4buZ389VLpUH8_LAIHk87nKZyeG6FAvT2Yl8lTaXn9_JmV3Ll9-c5dheOj8vtbJ5M-R6NPbMx76Jcjjyomt3D1Mx2LlUJbp5gBEBrbV-3OpdlWLvQd9fNhmuTDHTffqmuFHhtLzzJjhG7g36t78K1qCawmuJbiWcEVLcDbB2QRnE66xCc4mOJvgbEKFTSNi5ER8OxXxkpB7MQPvZPX9-MjXR6u0Wx_7Z9H3KCHTzOIUIs5ushSW6MNJcliCPWdifNlCANU8r6Fp3GBXC026xcbe9RvuVSvWEQKUEOAOBFpCoHcg-CUE_3YEqNcBahDqdahDqNehDqFeh4sIzqnoL3QquE_MglPBfWIWnAruE7PgVHCPmCWnqtKBXu1UVTrUIdTrUIdQr0MNQsEf6H06FPyB3qdDwR_ofToU_IHepwPU6-Bf7Q9VOtQh1OtQh1Cvw0WEirBobxkvxkVX6XJYNDcZDSMaNI5o0DiiQeOIBo0jGjSOaNA4okHTiHbhTu6qWAaNYxk0jmXQOJZB41gGjWMZNI5l0DiWQdNYduGG-2Z3ui8UQuNQCI1DITQOhdA4FELjUAiNQyHcGgor
nKr6hvt8g3Ou1iZ4bYJ35n-BbYbWZmhthtZmaHeEpYo8jdb9-9LVKadpbbLWJmuNkrVb7nnanO3Uq9qcrexVBhI96cJXOm0G12ZwjTK4kou138a1yVqbrLXJ2u3JWpuytSlbw5TNv-aJE7_N2dqc7Z4so822TvzhH5xttQnSGX_4ByVIxdad0YvZhSTsBotuNwqqvtQ6PJlM7SOmndGn_AlV97zp6Dz-Wun9A-z5I8PYtsIGVtj15PzjI0ckwJGAnATcQgJ-FQnqSNCcBL2FBP1VJHxHws9J-LeQ8O8mUfmE7jlngdxZoNpZzjCklz0mp0kv8zxDCRwlyClVuk4lpUr_uZsSdZRy-GpHqqRU6U13U_IdJT-nVOlWlZQqfes6SvsdDoWC4nIlT5arowq7bQeHO-b9nond5qeHcOKHj_4je9CRFnzy9bDdMTrsFtzt2TLFbtNWGg_7bgsWKW-EfMhSMTna7xjpZTbrBTLGEyHWu7fuKpX_5QHOw892d53CDwN__Dh-WE64F9IR9EPmDcZ9fxhyOuPj0YDPx3Q29-joQbAZF2qCq3uH0oRv3AY9sx9r8PIQTahH8QUjbzwYeOOex2E0GMLMHwbDEZ0PO32PxywSPcPDbMR8SCeW0ixbKLwosNvqcJGhIouEc2sO8VmmlzKdZEkgOFPxg7U9sdz_BxGgFDs">