<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/56332">56332</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Terrible vector codegen for cross product
</td>
</tr>
<tr>
<th>Labels</th>
<td>
backend:X86,
llvm:SLPVectorizer,
missed-optimization
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
RKSimon
</td>
</tr>
</table>
<pre>
https://godbolt.org/z/WGosfesMd
We struggle to vectorize a typical cross-product pattern efficiently: each result lane reads two different lanes of each input, so the computation needs shuffles rather than straight lane-wise ops. It doesn't seem to make much difference whether the (unused) w component is explicitly zeroed (-DZEROW) or left undefined.
```
template <typename T>
T cross_zero(T v1, T v2) {
  T r;
  // r = v1 x v2, written lane by lane as scalar subscript ops
  r[0] = (v1[1] * v2[2]) - (v1[2] * v2[1]);
  r[1] = (v1[2] * v2[0]) - (v1[0] * v2[2]);
  r[2] = (v1[0] * v2[1]) - (v1[1] * v2[0]);
#ifdef ZEROW
  r[3] = 0;
#endif
  return r;
}

typedef float float4 __attribute__((vector_size(16)));
typedef double double4 __attribute__((vector_size(32)));

auto cross4(float4 x, float4 y) {
  auto r = cross_zero(x, y);
  return r;
}
auto cross4(double4 x, double4 y) {
  auto r = cross_zero(x, y);
  return r;
}
auto cross3(float4 x, float4 y) {
  auto r = cross_zero(x, y);
  return __builtin_shufflevector(r, r, 0, 1, 2);
}
auto cross3(double4 x, double4 y) {
  auto r = cross_zero(x, y);
  return __builtin_shufflevector(r, r, 0, 1, 2);
}
```
clang -g0 -O3 -march=btver2
```
cross4(float __vector(4), float __vector(4)): # @cross4(float __vector(4), float __vector(4))
        vmovshdup %xmm0, %xmm2 # xmm2 = xmm0[1,1,3,3]
        vpermilpd $1, %xmm1, %xmm3 # xmm3 = xmm1[1,0]
        vpermilpd $1, %xmm0, %xmm4 # xmm4 = xmm0[1,0]
        vmovshdup %xmm1, %xmm5 # xmm5 = xmm1[1,1,3,3]
        vmulss %xmm3, %xmm2, %xmm2
        vpermilps $226, %xmm1, %xmm3 # xmm3 = xmm1[2,0,2,3]
        vmulss %xmm4, %xmm5, %xmm4
        vmulps %xmm0, %xmm3, %xmm3
        vpermilps $226, %xmm0, %xmm0 # xmm0 = xmm0[2,0,2,3]
        vmulps %xmm1, %xmm0, %xmm0
        vsubss %xmm4, %xmm2, %xmm2
        vsubps %xmm3, %xmm0, %xmm0
        vmovlhps %xmm0, %xmm2, %xmm1 # xmm1 = xmm2[0],xmm0[0]
        vshufps $216, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0,2],xmm0[1,3]
        retq
cross4(double __vector(4), double __vector(4)): # @cross4(double __vector(4), double __vector(4))
        vextractf128 $1, %ymm0, %xmm4
        vextractf128 $1, %ymm1, %xmm6
        vpermilpd $1, %xmm0, %xmm2 # xmm2 = xmm0[1,0]
        vpermilpd $1, %xmm1, %xmm3 # xmm3 = xmm1[1,0]
        vunpcklpd %xmm0, %xmm4, %xmm5 # xmm5 = xmm4[0],xmm0[0]
        vshufpd $1, %xmm6, %xmm1, %xmm7 # xmm7 = xmm1[1],xmm6[0]
        vshufpd $1, %xmm4, %xmm0, %xmm4 # xmm4 = xmm0[1],xmm4[0]
        vunpcklpd %xmm1, %xmm6, %xmm6 # xmm6 = xmm6[0],xmm1[0]
        vmulsd %xmm2, %xmm1, %xmm1
        vmulsd %xmm3, %xmm0, %xmm0
        vmulpd %xmm5, %xmm7, %xmm5
        vmulpd %xmm6, %xmm4, %xmm4
        vsubsd %xmm1, %xmm0, %xmm0
        vsubpd %xmm5, %xmm4, %xmm4
        vinsertf128 $1, %xmm0, %ymm4, %ymm0
        retq
```
clang -g0 -O3 -march=btver2 -DZEROW
```
cross4(float __vector(4), float __vector(4)): # @cross4(float __vector(4), float __vector(4))
        vmovshdup %xmm0, %xmm2 # xmm2 = xmm0[1,1,3,3]
        vpermilpd $1, %xmm1, %xmm3 # xmm3 = xmm1[1,0]
        vpermilpd $1, %xmm0, %xmm4 # xmm4 = xmm0[1,0]
        vmovshdup %xmm1, %xmm5 # xmm5 = xmm1[1,1,3,3]
        vmulss %xmm3, %xmm2, %xmm2
        vpermilps $226, %xmm1, %xmm3 # xmm3 = xmm1[2,0,2,3]
        vmulss %xmm4, %xmm5, %xmm4
        vmulps %xmm0, %xmm3, %xmm3
        vpermilps $226, %xmm0, %xmm0 # xmm0 = xmm0[2,0,2,3]
        vmulps %xmm1, %xmm0, %xmm0
        vsubss %xmm4, %xmm2, %xmm2
        vxorps %xmm1, %xmm1, %xmm1
        vsubps %xmm3, %xmm0, %xmm0
        vblendps $1, %xmm2, %xmm1, %xmm1 # xmm1 = xmm2[0],xmm1[1,2,3]
        vshufps $76, %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0,3],xmm0[0,1]
        vpermilps $120, %xmm0, %xmm0 # xmm0 = xmm0[0,2,3,1]
        retq
cross4(double __vector(4), double __vector(4)): # @cross4(double __vector(4), double __vector(4))
        vextractf128 $1, %ymm0, %xmm4
        vextractf128 $1, %ymm1, %xmm6
        vpermilpd $1, %xmm0, %xmm2 # xmm2 = xmm0[1,0]
        vpermilpd $1, %xmm1, %xmm3 # xmm3 = xmm1[1,0]
        vunpcklpd %xmm0, %xmm4, %xmm5 # xmm5 = xmm4[0],xmm0[0]
        vshufpd $1, %xmm6, %xmm1, %xmm7 # xmm7 = xmm1[1],xmm6[0]
        vshufpd $1, %xmm4, %xmm0, %xmm4 # xmm4 = xmm0[1],xmm4[0]
        vunpcklpd %xmm1, %xmm6, %xmm6 # xmm6 = xmm6[0],xmm1[0]
        vmulsd %xmm2, %xmm1, %xmm1
        vmulsd %xmm3, %xmm0, %xmm0
        vmulpd %xmm5, %xmm7, %xmm5
        vmulpd %xmm6, %xmm4, %xmm4
        vsubsd %xmm1, %xmm0, %xmm0
        vxorpd %xmm1, %xmm1, %xmm1
        vsubpd %xmm5, %xmm4, %xmm4
        vinsertf128 $1, %xmm0, %ymm4, %ymm0
        vblendpd $8, %ymm1, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2],ymm1[3]
        retq
```
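For comparison, here is a hand-shuffled formulation of the same cross product (a sketch added for illustration; it is not part of the original reproducer, and `cross_shuffled` is a hypothetical name). It uses the classic a.yzx * b.zxy - a.zxy * b.yzx identity, so the whole computation stays in vector form with four shuffles, two vector multiplies, and one vector subtract, which is roughly the shape of codegen we'd hope to recover from the scalar pattern above:
```
// Hypothetical hand-vectorized reference using clang's vector extensions.
typedef float float4 __attribute__((vector_size(16)));

float4 cross_shuffled(float4 a, float4 b) {
  float4 a_yzx = __builtin_shufflevector(a, a, 1, 2, 0, 3); // a[1],a[2],a[0],a[3]
  float4 a_zxy = __builtin_shufflevector(a, a, 2, 0, 1, 3); // a[2],a[0],a[1],a[3]
  float4 b_yzx = __builtin_shufflevector(b, b, 1, 2, 0, 3);
  float4 b_zxy = __builtin_shufflevector(b, b, 2, 0, 1, 3);
  // Lane 3 evaluates a[3]*b[3] - a[3]*b[3], i.e. zero for finite w inputs.
  return a_yzx * b_zxy - a_zxy * b_yzx;
}
```
Note the lane-3 zero only holds for finite w inputs (NaN/Inf in w would yield NaN), so this is not a drop-in replacement for the -DZEROW variant in every case.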
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJztWU2T4yYQ_TXyhfIUAvnr4MPMeJNDktrU7lQ2lYtLEsgmKwkF0GQ8vz6gLyOMvePd2r3ELstGQrzX_WiatpVwcljvlapkgO8D9JN-7zhJeK7uuNjps1d9fPqZy4zK30gANwG8bz8_USCVqHe7nALFAc0yljJaqvwAnmmquGCvFMRAHSqWxjlIBZcSVIKTOlWgipWiogzQI4hLApgChFOpzxcKSEoLg1jEnyko6nQPCMsyKmiZUsAyoPYUBGhZl7WkJEAr8C9IeVHxUpMDJoGsaMoyRgngApRc3dlWB3PYvZtTRYsqj5UGxI_aUlrGBQVPAX7Xdj-1Zm9fqeCa8gk8h8Zk_Y0McbB4aO8D-pII8IPNBPSV2QMMZhsNvjEW68Gzh7C5gO4NxOxBHxuDNB36zRVwvCFsbxiwW9TQRUUjVHiCCseoyIuKXFQ4Qg1PUMMxKnRQA4RZRmgG_nr34f2nnqnlwj0XtG-npZ7qwSSqalHaui423azpmTK4Wc5j1X5GYLvVMSVYUiu63WoLjZFNHG6ljkR9Fs6Nce27h-yRCK8THcft1xuwMDrFaj_jWoduEzWRvq-z7cVETdc-OJEDQDNENGqMws2MOoyn6ZIqY-belYa6P_lu3K4F-Hv6vt0mNcsVK7dyX2dZTtu50aP08WgOqA-zUtFofk6Fwt9XqG811ElWaR6XOzDdQTB9j8G0iEW61_Yk6pkK5B8xikNtz2BA1ARvNzGnHdqYxhHPSy9TEETwG6AHmczrueDPck_qaoCfvRSF0aVrIpe8uWbmobnPZKFWQ9wcs42DX1FRsLwiA0IUHsGtJnY4cM8RdhzwSmzLh8jBjlz7PdheXSz4mUeXmWvzBV2KOtcbcu-8pbfd9Lorj-4iNL8kpk9K1IU9eoNZkeWupebpkEp6Igfbzas8sUDgyBNoT9yXPTma5Y8L6AyRdeJ1_sKc6CGVbxovsOjIyveD754VZ02pHWS9CGEvwnHTf-wk8cSxSX2azEgc-iUOT9Q-kbutRozUNlvoFV4n33-c9Ndt754kda7nigT4NeBjieiLEnGqshAtu2jsJTmM08g14yxV51-ZtobUey7tXpsSvzHd1mWVfrax3TTrZkc3LUZvDljiWO5Pcwt3aSzGXvRM8yuYIu9URC6Ts4X0TJGfyauc3725zTLvWeYj5UI_i8ndpEdHXsXCS0PemL7qxpGTbWFhbxYXhsy9Wrtry6RicnX29hp2gYWVkophAZ9ZiYcjxOGE9ZjtrqkWwXRj_Si7VY23qvFWNf6fqsYXLrwsFzL1VxSauvgpiV1o2oF-Zn9wg_NMsdmH6hkVj0Xn4s0159miEzsVg1kg5xa1FRshgtfGxjEufBy3wvZW2N4K21th-8MLW7Nb-IZ8Ybf44bVwt-H0wWAglqd54zBCG7J_ExmHPiEf7GQ__O9waCPk0n8OQy09IWtMVngVTxRTOV0_USGYyZptzgQpJ3RHS5CZtv1wbFKL3H0kx9S-Tu5SXuiTPH_uv6Z6yN8aTp8yKWsqdWM2xxhN9usUZyRMMERJtgwpXsIVhAijEJIloZiSSR4nNJdr7U-AUBKnn7V2mvHPpY6jLsZRw4XvP_76-x_9Mz1x7C00KSVTXilWsNdYMV6aztlmwtYIIgQXMISraIXDuxVdhqtkjuNZloTLBdH7DC1ilt8ZBvOocSLWjUdJvZO6M2dSyWNnLCXblZQ21mr8uFZ7LtYffvnICl5OGufXjef_ATzVgEU">