<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/133568>133568</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AVX512] Avoid Memory form of Compress in AMD znver4
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
venkataramananhashkumar
</td>
</tr>
</table>
<pre>
For the given LLVM IR code, the X86 backend generates the memory form of compress.
ref: https://godbolt.org/z/KhhczdbY8
.LBB0_4: # %vector.body
vptestmd k1, ymm1, ymm0
movsxd r8, r8d
vpaddd ymm1, ymm1, ymm2
vmovupd zmm3 {k1} {z}, zmmword ptr [rsi + r11]
kmovb ebx, k1
popcnt ebx, ebx
kortestb k1, k1
cmove ebx, r10d
add r11, 64
vcompresspd zmmword ptr [rdi + 8*r8] {k1}, zmm3
add r8d, ebx
cmp r9, r11
jne .LBB0_4
The memory form is microcoded and slower. We need to generate the sequence shown below instead.
kmovb %k1, %r11d
pextl %r11d, %r11d, %ebx
vcompresspd %zmm3, %zmm3 {%k1} {z}
kmovd %ebx, %k1
vmovupd %zmm3, (%rdi,%rcx,8) {%k1}
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJxsVE2TozgM_TXiopqUMeHrwIF0KlVb27nsYXb2tGWwCEwwZm2HdOfXbxmSHtLTXOxYetJ7kiJhbXcaiAqIdxDvA3FxrTbFRMNZOGGEEoMYWmHb80UJE1RavheNNuhawlM30YCvr9-P-MdfWGtJ-CNL8EQDGeHIoiKlzTs22ijUDdZajYas3SCw0lADUYmtc6OFqAR-AH44aVnp3m20OQE_3IAf_mzb-iarfzJgJbBy87rbsX-3Hgk8QuDxRLXTZuOJASvx_k2jI-uUfPw-h8Bf8F2px8lWzkpP9k0imswbTSafAgkpJa6hj5N7t0np6TJKvCkVIaS7cwjp3l9ukO69402pqzYSR2cQ4p2xHQLfoQlDiPerRGelpwoRqXrzsHPobaMe68F9PPpjhdDGi6yeNS7A-1crPdGvoCZka21CLvXxXPgLJtu17kezxsXnswy5yMiAlyaDeP-h_a45-ipPJn9XUatxMeYLw5n-z4Hmx0e7594fV9PUWVRdbfQ8dRLFINH2-kpmg_g34UAk0emPUURL_11oqAmFRdvqK1bU6-s8iF_0AHi8FBN4bMJwrtlIb65fbPPTyrpcn2V9rh_weK7K4vuYliXRamBYOZOQC-LetsVrFfs-c08xM89GdsBf_KX2wAx4vs4CrAxkEck8ykVARZhuI87TNM6DtmAJk0lESVSHSUNpFiW1zPKoSVjYbFlSBV3BGY9ZxHMWx3GcbyqqJFUNz5oqrdM0hy0jJbp-0_eT8v_goLP2QkUYRXGSBb2oqLfzmuF8oCvOVuDcbx1TeNC36nKysGV9Z539FcZ1rp_3U_n9Rxx6fywn3Uk8Pq-Xl3vFsRuwPO7xNkxktsHF9MWnLdO59lJtaq2AH3ya-_FtNPon1Q74YSZngR_u7KeC_x8AAP__YdJ-Ug">