<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/102666">102666</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Slow code generated for avx2 four-way vector interleave
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
abadams
</td>
</tr>
</table>
<pre>
Four-way interleaves with avx2 aren't generating good code. Consider the following:
```
typedef float floatx8_vec __attribute__((ext_vector_type(8)));
typedef float floatx16_vec __attribute__((ext_vector_type(16)));
auto interleave2(floatx8_vec a, floatx8_vec b) {
return __builtin_shufflevector(a, b, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
}
auto interleave2(floatx16_vec a, floatx16_vec b) {
return __builtin_shufflevector(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
}
auto interleave4(floatx8_vec a, floatx8_vec b, floatx8_vec c, floatx8_vec &d) {
return interleave2(interleave2(a, c), interleave2(b, d));
}
```
With -O3 -mavx2 the compiler generates good code for the interleave2 functions, but makes a mess of interleave4. It should generate 16 unpckh/unpckl/vperm2 instructions (18 cycles on Skylake according to llvm-mca), but instead it generates 44 vshuf/vperm/blend instructions (32 cycles). See below (copy-pasted from godbolt):
```
interleave2(float vector[8], float vector[8]):
vunpckhps ymm2, ymm0, ymm1
vunpcklps ymm1, ymm0, ymm1
vperm2f128 ymm0, ymm1, ymm2, 32
vperm2f128 ymm1, ymm1, ymm2, 49
ret
interleave2(float vector[16], float vector[16]):
push rbp
mov rbp, rsp
and rsp, -64
sub rsp, 64
vmovaps ymm0, ymmword ptr [rbp + 16]
vmovaps ymm2, ymmword ptr [rbp + 48]
vmovaps ymm1, ymmword ptr [rbp + 80]
vmovaps ymm3, ymmword ptr [rbp + 112]
vunpckhps ymm4, ymm0, ymm1
vunpcklps ymm1, ymm0, ymm1
vperm2f128 ymm0, ymm1, ymm4, 32
vperm2f128 ymm1, ymm1, ymm4, 49
vunpckhps ymm4, ymm2, ymm3
vunpcklps ymm3, ymm2, ymm3
vperm2f128 ymm2, ymm3, ymm4, 32
vperm2f128 ymm3, ymm3, ymm4, 49
mov rsp, rbp
pop rbp
ret
interleave4(float vector[8], float vector[8], float vector[8], float vector[8]&):
vmovaps ymm3, ymmword ptr [rdi]
vshufps xmm4, xmm2, xmm2, 212
vpermpd ymm4, ymm4, 96
vshufps xmm5, xmm0, xmm0, 212
vpermpd ymm5, ymm5, 212
vblendps ymm4, ymm5, ymm4, 204
vshufps xmm5, xmm1, xmm1, 96
vpermpd ymm5, ymm5, 212
vshufps xmm6, xmm3, xmm3, 96
vpermpd ymm6, ymm6, 96
vblendps ymm5, ymm5, ymm6, 204
vblendps ymm6, ymm4, ymm5, 170
vshufps xmm4, xmm2, xmm2, 246
vpermpd ymm4, ymm4, 96
vshufps xmm5, xmm0, xmm0, 246
vpermpd ymm5, ymm5, 212
vblendps ymm4, ymm5, ymm4, 204
vshufps xmm5, xmm1, xmm1, 232
vpermpd ymm5, ymm5, 212
vshufps xmm7, xmm3, xmm3, 232
vpermpd ymm7, ymm7, 96
vblendps ymm5, ymm5, ymm7, 204
vblendps ymm4, ymm4, ymm5, 170
vshufps ymm5, ymm2, ymm2, 212
vpermpd ymm5, ymm5, 232
vshufps ymm7, ymm0, ymm0, 212
vpermpd ymm7, ymm7, 246
vblendps ymm5, ymm7, ymm5, 204
vshufps ymm7, ymm1, ymm1, 96
vpermpd ymm7, ymm7, 246
vshufps ymm8, ymm3, ymm3, 96
vpermpd ymm8, ymm8, 232
vblendps ymm7, ymm7, ymm8, 204
vblendps ymm5, ymm5, ymm7, 170
vshufps ymm2, ymm2, ymm2, 246
vpermpd ymm2, ymm2, 232
vshufps ymm0, ymm0, ymm0, 246
vpermpd ymm0, ymm0, 246
vblendps ymm0, ymm0, ymm2, 204
vshufps ymm1, ymm1, ymm1, 232
vpermpd ymm1, ymm1, 246
vshufps ymm2, ymm3, ymm3, 232
vpermpd ymm2, ymm2, 232
vblendps ymm1, ymm1, ymm2, 204
vblendps ymm3, ymm0, ymm1, 170
vmovaps ymm0, ymm6
vmovaps ymm1, ymm4
vmovaps ymm2, ymm5
ret
```
If you force materialization of the intermediate values, you get the expected code (apart from the spill/reload that was used to force materialization):
```
auto interleave4_v2(floatx8_vec a, floatx8_vec b, floatx8_vec c, floatx8_vec d) {
auto ac = interleave2(a, c);
auto bd = interleave2(b, d);
decltype(ac) mem[2];
mem[0] = ac;
mem[1] = bd;
asm volatile ("" : "+m"(mem));
ac = mem[0];
bd = mem[1];
return interleave2(ac, bd);
}
```
```
interleave4_v2(float vector[8], float vector[8], float vector[8], float vector[8]):
push rbp
mov rbp, rsp
and rsp, -64
sub rsp, 192
vunpckhps ymm4, ymm0, ymm2
vunpcklps ymm0, ymm0, ymm2
vperm2f128 ymm2, ymm0, ymm4, 32
vperm2f128 ymm0, ymm0, ymm4, 49
vunpckhps ymm4, ymm1, ymm3
vunpcklps ymm1, ymm1, ymm3
vperm2f128 ymm3, ymm1, ymm4, 32
vperm2f128 ymm1, ymm1, ymm4, 49
vmovaps ymmword ptr [rsp + 32], ymm0
vmovaps ymmword ptr [rsp], ymm2
vmovaps ymmword ptr [rsp + 96], ymm1
vmovaps ymmword ptr [rsp + 64], ymm3
vmovaps ymm0, ymmword ptr [rsp]
vmovaps ymm2, ymmword ptr [rsp + 32]
vmovaps ymm1, ymmword ptr [rsp + 64]
vmovaps ymm3, ymmword ptr [rsp + 96]
vunpckhps ymm4, ymm0, ymm1
vunpcklps ymm1, ymm0, ymm1
vperm2f128 ymm0, ymm1, ymm4, 32
vperm2f128 ymm1, ymm1, ymm4, 49
vunpckhps ymm4, ymm2, ymm3
vunpcklps ymm3, ymm2, ymm3
vperm2f128 ymm2, ymm3, ymm4, 32
vperm2f128 ymm3, ymm3, ymm4, 49
mov rsp, rbp
pop rbp
ret
```
If this is hard to fix, I'd love a workaround that doesn't involve memory operations or inline assembly. Tagging @RKSimon because, according to the git history, he has been working on x86 shuffle lowering most recently.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8WUuPqzoS_jXOptQtKJ5ZZHH7tFo6msVIc0aaZWTASZg2GGFDOvPrRxgSMM_0nTMXRUAo15OvjKtMpUzPOWMH4r0R731HK3UR5YFGNKGZ3EUiuR0-RFW-XOkN0lyxkjNaMwnXVF2A1l8ItGQ5wUDBmeWspCrNz3AWIoFYJOwVfohcpgkrQV0YnATn4prmZ-L8Qax3Yt3PvtX99F91K1jCTnDigqr2_BUeaxbD8UiVKtOoUux4JBgSDNmXakhKlMeGj2AYEtx3P-dtWaLtPy_S9qcy2zOtlBhEBgmGQ4MpwR-GBxHBPZCgkwEAUDJVlTkcj1GVcpXmR3mpTifOWgsIhlpG1Jys5hQ2J7s57ZsT6r-a4uhbTXL1raZ5-lbTfH2raYG-9QyHgvenPOsCN3CtezLy7buO2f7DMzvoXQt71_YP19B6uIb2wzXEh2voPGKFbh8sr48W-n24MOjjhWEfMNz3EXOse8h-gGM_Hzj3CUiYD-LxA4J-soQb8w2Z_7SqmLRemCStNBlD-uGImZDt-V9Nzr_83YGXTGd--sh5JvuUh5Nok32gD05VHqtU5FK_8UpBRj-ZBAoZk91kMgjYK_xUIC-i4slDA9g-VHkRf14IfugbTvCjLliZIaS5VGXVaoAmXUOIbzFnEkQO8vPG6ScDGseiTJrpSQngvM5esph2sWlMaoQwmphuuS7UDWzvugh-RJzlyUSlg51KgvtX-MUYRIyLa0OKRXF7KahULIFTKTI4iyQSXOnIr82DM8kHXfJ4byHx3h84GT_uBUPdRq2Q0B63LNMwv2WZ1V3tHlTN0XLwIYe9waFfw8nG8MHRj2yvWqeDi3yDoQaLuzdZSqaGEVsNke3Px6h7PghSdxSVvGglUWESMlHDnYA_oJR3Os2TliA14cV3TUZZRUP6mFxnoqaFHAbsKsoEClUC8d7KqACCb9AavMSJK5xu2HMOWOwVltBaU-asmWnjlHUKP_dZ-P0W4LkG8GYYxrBz52C34sY9_M5mFjkbHEY29IPm_Fj235nje7jzAHKLxwnQC1GAmQGLCed-c0763mN_Lj-3cJikA7Q303Yh4asLwVcX0vsV7blYFsnw1bYLB38q0eskWcPrmkSvk-iZA2v9MXlgZKjbG9qA1njimNpiD68Po582ZSrY7wQ6w-uKYL8T7M-Om3rqjTz15z2dMvrD0NwF2IG16Mk8ANxlV-YBsBx7EwcrgjeCf_f1t-EAzZnvmwgI5hAwKzLoRAZ_8t0HpncrefHcSx-IxuH1-RydTLW95GD0UdrMfiM6U3gshycwTFp6-YOBxsdsJVc3LOoFh6PPydYkcB8fzgdx6mowcjXcRMIsdtaQgLNIWMlSc-AKEqxZJKxI3hg4dXasAbeQMF7O2PM-9CaZA1egMF6ROFuSN8I4dXa2AljFgkPG68NZLEyX2f7yonh5jY4P5I3XRnP18s8T3ETV1MMxg4wqVqaUp_-hTc0I4tQXyRlL0qbErSmvmC6RG74zU3oI-ypY3NSOuromGGZpXklNkkXKmzK4ZFzQBCrJkqa4ndVoVJumueOGxbF-po210bOYNiy0GhoDcd5hqU9x70S0g6NkZnDfuTA7cY2KhMW8a9nRRh5kLCPemy5LnIEp7eOm0NEKaDxDte_UKDGoVGZQC05VyvXrIIgEEYjzB-jbt0xfwkaK2V7R3K3_vQEGtXO4N2DGx9m-D9Xxj5KnujlLXYbhi_-_LOq_WXFPqu3u-GbRbe9xuRcyLkbHk9S0ipvOyFsl2bjfslXKzSj5blFqE7PEXOnpjIcu--GQtdJ6me-5CrufZo2aTra9BQc7YOmgPMnZs-B0up_Tsvd7lnGrYZ3Td3vOcSQ3mjytnd_q7hhBWeKcbfIY5k5iMltVG7H5bc2dCTq2Wzyb3Z0_C8G_uLsznheebfFs
dne6439p8iwsZNQllZBKuNCyXWKkX43onwSDBLioGVC4ivKTlqLKE1AXqiARTLZbg2leC16z5rsmyhuIQm8UilyCKCHNeZozoFKyLOK3V_gnPZ_T_AzEtf7xt19pJnKg0mzfn1MFl1SqRtqFwYVKiBjLtQnNEJHDV-hDt-EEXFxZ2TzPhFRQspjlit9ed8nBSfbOnu7YwQ7QQcf2XX93OQR-6EYs8WNnb3ueH3v0hKEd2z7zqBvEyS49oIWuFVp7e48B-q827hMaYOL4pz3FyCWuxTKa8lfO6-xVlOddKmXFDraFvu_vOI0Yl3rXFTFnV9DUZt3gve_Kg96diKqzJK7FU6lkL0alirPDLy6u7XrwvlWR6H0XvS9zum_Ytl_fwUphV5X8cFGqkM2HGD8IfpxTdami11hkBD8aLd3lpSjFv1msCH5o2yTBj874-oD_DQAA__9ynIK2">