<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href="https://github.com/llvm/llvm-project/issues/61683">61683</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            x86 AVX2: Inefficient code when extracting vector values as a bitmask (through boolean vector, aiming for `movmsk`)
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          He3lixxx
      </td>
    </tr>
</table>

<pre>
    Consider this piece of code, where `as_bitmask` is a function that should simply extract the truthiness of each vector element and return the results as a bitmask.
```c++
#include <cstddef>
#include <cstdint>

// Converts every lane of `vec` to a boolean and returns the first byte of the
// resulting boolean vector.
//
// VectorT: a vector type that can be subscripted (`VectorT{}[0]`) and passed
//          to `__builtin_convertvector`.
// Returns: the bool vector's storage, read through a `uint8_t` reference.
//
// NOTE(review): the intent is one mask bit per lane; this relies on Clang
// bit-packing bool ext vectors - confirm against the Clang documentation.
// NOTE(review): the return type is a single byte, so this presumably assumes
// at most 8 lanes; with fewer than 8 lanes the remaining bits of the byte are
// not produced by the conversion - confirm what they contain.
template<class VectorT>
uint8_t as_bitmask(VectorT vec) {
  // Lane count: size of the whole vector divided by the size of one element.
  constexpr size_t NUM_MASK_BITS = sizeof(VectorT) / sizeof(VectorT{}[0]);

  // Boolean vector type with one element per input lane.
  using MaskVecT __attribute__((ext_vector_type(NUM_MASK_BITS))) = bool;
  // Element-wise conversion of each lane to bool.
  MaskVecT mask_vector = __builtin_convertvector(vec, MaskVecT);
  // Reinterpret the bool vector object as one byte and return that byte.
  return reinterpret_cast<uint8_t&>(mask_vector);
}

// Vectors of uint32_t that are 16 and 32 bytes wide, i.e. 4 and 8 lanes.
using uint32x4_t __attribute__((vector_size(16))) = uint32_t;
using uint32x8_t __attribute__((vector_size(32))) = uint32_t;

// Explicit instantiations of as_bitmask for both vector widths, so that code
// is generated for each of them.
template uint8_t as_bitmask(uint32x4_t);
template uint8_t as_bitmask(uint32x8_t);
```

When compiling with `-O3 -mavx2`, clang currently generates this code ([godbolt](https://godbolt.org/z/PMKMfx1vx)):
```asm
unsigned char as_bitmask<unsigned int __vector(4)>(unsigned int __vector(4)):
 vpxor   xmm1, xmm1, xmm1          ; xmm1 = [0] * 4
        vpcmpeqd xmm0, xmm0, xmm1  ; xmm0 = [-1 if el == 0 else 0 for el in input]
        vpcmpeqd        xmm1, xmm1, xmm1  ; xmm1 = [-1] * 4 (all bits set)
        vpxor   xmm0, xmm0, xmm1          ; invert xmm0 -> xmm0 = [0 if el == 0 else -1 for el in input]
        vpackssdw       xmm0, xmm0, xmm0  ; compress 4B values to 2B, repeating values
        vpsllw  xmm0, xmm0, 15            ; superfluous left-shift?
        vpacksswb       xmm0, xmm0, xmm0  ; compress 2B values to 1B, again repeating values
        vpmovmskb eax, xmm0         ; extract from xmm0, giving  [0 if el == 0 else 1 for el in input] * 4
        ret
  
unsigned char as_bitmask<unsigned int __vector(8)>(unsigned int __vector(8)):
        vpxor   xmm1, xmm1, xmm1 ; ymm1 = [0] * 8
        vpcmpeqd        ymm0, ymm0, ymm1  ; ymm0 = [-1 if el == 0 else 0 for el in input]
        vmovmskps       eax, ymm0 ; extract from ymm0, giving [1 if el == 0 else 0 for el in input]
        not     eax                       ; bitwise-invert result -> [0 if el == 0 else 1 for el in input]
        vzeroupper
        ret
```

It looks to me as if the `uint32x4_t`/`xmm`-variant could be simplified here, using the same approach that is used in the `uint32x8_t`/`ymm`-variant, saving 4 instructions:

```asm
unsigned char as_bitmask<unsigned int __vector(4)>(unsigned int __vector(4)):
        vpxor   xmm1, xmm1, xmm1 ; xmm1 = [0] * 4
        vpcmpeqd        xmm0, xmm0, xmm1  ; xmm0 = [-1 if el == 0 else 0 for el in input]
        vmovmskps       eax, xmm0 ; extract from xmm0, giving [1 if el == 0 else 0 for el in input]
        not     eax                       ; bitwise-invert result -> [0 if el == 0 else 1 for el in input]
        ret
```

The problem does not occur when compiling for newer architectures supporting AVX512, as LLVM uses vector masks then.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzUV19v4zYS_zT0y8CBRNmK_OCH2FnjFru5O2Bzub4JlDSy2FCkSlKOvJ--ICXbsuNssm2BooRgwSI58_vNH86QGcO3EnFJ5isyv5-w1lZKL_-FkeBd100yVeyXayUNL1CDrbiBhmOOoErIVYHwwm0FJA6YSTNua2aeSRxAhlxugUHZytxyJcFWzIKpVCsKMLxuxB6ws5rlFmyFYHVrKy7RGCcYWV7BDnOrNKDAGqUFJgvQaFvtZGENzACDQeMNCe5JcEfioH9yQlfu6b_SiMtctAUCida5sUWBJYk-vTXLpT3N-l-LdSOYRbdAMGPgyUN7PC5rubRJamFkA5oMixwPQhdAbgc8ALmSxmLXaDD8O6YW_v2_h_Th7tuXdPX58RuQ6N5PqPIkxUugm9ffb1fk9p7MVwGZ3xO6INFqjBygNc4RD8w8P2H-CGnKrNU8ay2mKaEJoQl2Nu1tndp9g4QmZ3CcUP94XJlS4qgDTnId50GKX5emWcuF5TLNldyhtv0coYk3x_q4c4wZDg7WyKVF3Wi0ac6MJdF6MDGhsbM6TUYKz2jf3o_59-zd3oh2s9Re4z9wd5YlNAnjc8L93tQeNZyJTD4iMqLviDyPMrgaTScOY7of2pOc7zlmyVj5_yuUkKu64cLROyT19D8RTGu266hbT9eQCya3kLdao7RiD1uUqJlF0x8N_kRwNpivtqrIlLA-LJPK2saQ6I7QDaGbYepG6S2hm--Ebv778OWh7MJdN1jqHCgz9WB76Q-rAvKK6THXaH2c4tK55BhuM0_dRcwPV7hFh5zZNZ3SANDVdeg4j99wHCRa9V-cS4cEBELvYHYI5n7smrxu8LfCLQ4GMcFJ3CAmOIiZhsBLQOH-u08BoDAIAZT-LAQugcum9XY9wB0UDOM67Eu44Ztwj-SvoB2T5z6vh4kpiT6dm-MqjWn4No8jAJY_G1O8nOhcAgl6AC5ctasYsxXsmGhdECpwJ_8aNDbIrAvlfuZShxHi5bXscA5wTtK0DepStKo1ILC0U1Px0pJocxX0S_Zx0HQMOvSg2ZZx-S70Wu1q85wBsu4kegT5UFZLreojgC3fOYlve-aaY67Fh0Z7-PCHczJ5NyeTy5x8HZ2vY9yR31_LyOSNjBzGfrDR6D0kzP5PJeZBXe-wxgz_B78Nsi_8tT_3l0vUn1UrlT3ogevDaZW4dXVDo2mF7dP3p2LjguN31KptGtRvRMvVovPZglDq2WdAja6j46XvBkkcjOqdqzwbEgddXbuStGOaM2kh971khn07yUuOBVSo0ZmvL9JOlGFOctNo5XpK34VyA63xMXehLBkp258pczIN8y6ZAZfG6ta3teYYoX9vyfpoevxUwXr7LPtL6tYwLrLj7Fh77zj7p6THj9PgsUJotMoE1lAoNB6iyvNWw8t5W-b0SHxBDUznFbeY21ajcVWqUdrXjLunX-Yh9dXEwNevTw8u1s3hMuUiz7VqKG8mxTIqFtGCTXAZxreLOE5iGk2q5WzGyjKLkGZBmSWLcDEPMIxZ5lZnJcsnfEkDGgURnYWz6DaIbuKwnM-KMMrmszhAGpFZgDXj4kaIXe26vAk3psVlHMZJNBEsQ2H8fZNSiS_gJwml7vqpl27PNGu3hswCwY01JymWW4HLLokdS0qiO_gssSx5ztGfB-426gw2hIwvoT3vodKO74yuTbWVVu228rcaZBIOCbYGxuuDwUkc9DHqz4bFpNViedHPclu12U2uakI3Du3wmjZa_Yq5JXTjORpCN94GvwcAAP__wGahTA">