<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/68818">68818</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Inline asm register operands allocated only into `zmm0..zmm15`, not using `zmm16..zmm31`.
</td>
</tr>
<tr>
<th>Labels</th>
<td>
backend:X86,
llvm:codegen,
performance
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
bjacob
</td>
</tr>
</table>
<pre>
See Compiler Explorer testcase: https://godbolt.org/z/nh44Tsdo1
The hot loop in this piece of code needs 17 zmm registers, all of them used as inline asm register operands. But only `zmm0..zmm15` get allocated, resulting in inefficient code that copies between zmm registers and spills.
Testcase pasted here for completeness:
```c
#include <immintrin.h>
#include <stdint.h>
static inline __m512 foo(__m512 acc, __m512 lhs, const float* rhs) {
asm("vfmadd231ps %[rhs]%{1to16%}, %[lhs], %[acc]"
: [acc] "+x"(acc)
: [lhs] "x"(lhs), [rhs] "m"(*rhs)
:);
return acc;
}
void bar(void* out_tile, const void* lhs_panel, const void* rhs_panel, int K) {
float* out_ptr = out_tile;
const float* lhs_ptr = lhs_panel;
const float* rhs_ptr = rhs_panel;
__m512 acc[16];
for (int i = 0; i < 16; ++i) {
acc[i] = _mm512_loadu_ps(out_ptr + i * 16);
}
for (int32_t k = 0; k < K; ++k) {
__m512 rhs = _mm512_loadu_ps(rhs_ptr);
rhs_ptr += 16;
acc[0] = foo(acc[0], rhs, lhs_ptr + 0);
acc[1] = foo(acc[1], rhs, lhs_ptr + 1);
acc[2] = foo(acc[2], rhs, lhs_ptr + 2);
acc[3] = foo(acc[3], rhs, lhs_ptr + 3);
acc[4] = foo(acc[4], rhs, lhs_ptr + 4);
acc[5] = foo(acc[5], rhs, lhs_ptr + 5);
acc[6] = foo(acc[6], rhs, lhs_ptr + 6);
acc[7] = foo(acc[7], rhs, lhs_ptr + 7);
acc[8] = foo(acc[8], rhs, lhs_ptr + 8);
acc[9] = foo(acc[9], rhs, lhs_ptr + 9);
acc[10] = foo(acc[10], rhs, lhs_ptr + 10);
acc[11] = foo(acc[11], rhs, lhs_ptr + 11);
acc[12] = foo(acc[12], rhs, lhs_ptr + 12);
acc[13] = foo(acc[13], rhs, lhs_ptr + 13);
acc[14] = foo(acc[14], rhs, lhs_ptr + 14);
acc[15] = foo(acc[15], rhs, lhs_ptr + 15);
lhs_ptr += 16;
}
for (int i = 0; i < 16; ++i) {
_mm512_storeu_ps(out_ptr + i * 16, acc[i]);
}
}
```
Compile with: `-O2 -mavx512f`
Result (hot loop excerpt):
```asm
.LBB0_1: # =>This Inner Loop Header: Depth=1
vmovups zmmword ptr [rsp - 64], zmm0 # 64-byte Spill
vmovups zmm0, zmmword ptr [rdx]
vfmadd231ps zmm3, zmm0, dword ptr [rsi]{1to16} # zmm3 = (zmm0 * mem) + zmm3
vfmadd231ps zmm15, zmm0, dword ptr [rsi + 4]{1to16} # zmm15 = (zmm0 * mem) + zmm15
vfmadd231ps zmm14, zmm0, dword ptr [rsi + 8]{1to16} # zmm14 = (zmm0 * mem) + zmm14
vfmadd231ps zmm13, zmm0, dword ptr [rsi + 12]{1to16} # zmm13 = (zmm0 * mem) + zmm13
vfmadd231ps zmm12, zmm0, dword ptr [rsi + 16]{1to16} # zmm12 = (zmm0 * mem) + zmm12
vfmadd231ps zmm11, zmm0, dword ptr [rsi + 20]{1to16} # zmm11 = (zmm0 * mem) + zmm11
vfmadd231ps zmm10, zmm0, dword ptr [rsi + 24]{1to16} # zmm10 = (zmm0 * mem) + zmm10
vfmadd231ps zmm9, zmm0, dword ptr [rsi + 28]{1to16} # zmm9 = (zmm0 * mem) + zmm9
vfmadd231ps zmm8, zmm0, dword ptr [rsi + 32]{1to16} # zmm8 = (zmm0 * mem) + zmm8
vfmadd231ps zmm7, zmm0, dword ptr [rsi + 36]{1to16} # zmm7 = (zmm0 * mem) + zmm7
vfmadd231ps zmm6, zmm0, dword ptr [rsi + 40]{1to16} # zmm6 = (zmm0 * mem) + zmm6
vfmadd231ps zmm5, zmm0, dword ptr [rsi + 44]{1to16} # zmm5 = (zmm0 * mem) + zmm5
vfmadd231ps zmm4, zmm0, dword ptr [rsi + 48]{1to16} # zmm4 = (zmm0 * mem) + zmm4
vfmadd231ps zmm2, zmm0, dword ptr [rsi + 52]{1to16} # zmm2 = (zmm0 * mem) + zmm2
vmovaps zmm1, zmm15
vmovaps zmm15, zmm14
vmovaps zmm14, zmm13
vmovaps zmm13, zmm12
vmovaps zmm12, zmm11
vmovaps zmm11, zmm10
vmovaps zmm10, zmm9
vmovaps zmm9, zmm8
vmovaps zmm8, zmm7
vmovaps zmm7, zmm6
vmovaps zmm6, zmm5
vmovaps zmm5, zmm4
vmovaps zmm4, zmm2
vmovups zmm2, zmmword ptr [rsp - 128] # 64-byte Reload
vfmadd231ps zmm2, zmm0, dword ptr [rsi + 56]{1to16} # zmm2 = (zmm0 * mem) + zmm2
vmovups zmmword ptr [rsp - 128], zmm2 # 64-byte Spill
vmovaps zmm2, zmm4
vmovaps zmm4, zmm5
vmovaps zmm5, zmm6
vmovaps zmm6, zmm7
vmovaps zmm7, zmm8
vmovaps zmm8, zmm9
vmovaps zmm9, zmm10
vmovaps zmm10, zmm11
vmovaps zmm11, zmm12
vmovaps zmm12, zmm13
vmovaps zmm13, zmm14
vmovaps zmm14, zmm15
vmovaps zmm15, zmm1
vmovups zmm1, zmmword ptr [rsp - 64] # 64-byte Reload
vfmadd231ps zmm1, zmm0, dword ptr [rsi + 60]{1to16} # zmm1 = (zmm0 * mem) + zmm1
vmovups zmmword ptr [rsp - 64], zmm1 # 64-byte Spill
vmovups zmm0, zmmword ptr [rsp - 64] # 64-byte Reload
add rdx, 64
add rsi, 64
dec ecx
jne .LBB0_1
```
The inefficient part is all these `vmovaps zmm, zmm` copies between registers, plus the spills and reloads explicitly called out in the comments.
Ideally this loop would be just the `vmovups zmm0, zmmword ptr [rdx]` load followed by the 16 `vfmadd231ps` instructions, but that would require allocating zmm asm operands beyond `zmm15`.
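For reference, a minimal sketch of a possible workaround (not verified against this testcase): the GCC/Clang `v` machine constraint, unlike `x`, also covers the EVEX-only registers `zmm16..zmm31` when AVX-512 is enabled, so switching the register operands from `x` to `v` may let the allocator use the upper zmm registers. `foo_v` is a hypothetical name for this variant.
```c
#include <immintrin.h>

// Hypothetical variant of foo(): the "v" constraint (unlike "x") permits the
// EVEX-only registers zmm16..zmm31, so the 17 live vectors may fit without
// copies or spills.
static inline __m512 foo_v(__m512 acc, __m512 lhs, const float* rhs) {
  asm("vfmadd231ps %[rhs]%{1to16%}, %[lhs], %[acc]"
      : [acc] "+v"(acc)
      : [lhs] "v"(lhs), [rhs] "m"(*rhs)
      :);
  return acc;
}
```
Whether this removes all the copies and spills in the excerpt above has not been checked here.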
</pre>