<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/84967>84967</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            Several SIMD equal-each's can sometimes be folded into a vector shuffle
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          Validark
      </td>
    </tr>
</table>

<pre>
    Just saw this trick in [this video](https://www.youtube.com/watch?v=R_yX0XjdSBY&t=2058s).

[Godbolt link](https://zig.godbolt.org/z/EPqj79KxY)

Effectively, we can turn this:

```asm
.LCPI0_0:
        .zero   32,32 ; 32 bytes, each 32 (' ')
.LCPI0_1:
 .zero   32,247 ; 32 bytes, each 247 (-9 modulo 256)
.LCPI0_2:
        .zero 32,1 ; 32 bytes, each 1
maskWhitespace:
        vpcmpeqb        ymm1, ymm0, ymmword ptr [rip + .LCPI0_0] ; match ' '/32/0x20
        vpaddb  ymm0, ymm0, ymmword ptr [rip + .LCPI0_1]; add 247 (-9) to every byte: moves '\t' (9) and '\n' (10) to 0 and 1
        vpminub ymm2, ymm0, ymmword ptr [rip + .LCPI0_2]; unsigned min with 1: bytes that were '\t' stay 0, every other byte becomes 1
        vpcmpeqb        ymm0, ymm0, ymm2; equal only where the shifted byte is 0 or 1, i.e. where the input was '\t' or '\n'
        vpor    ymm0, ymm0, ymm1 ; merge the '\t'/'\n' matches with the ' ' matches
        vpmovmskb       eax, ymm0 ; pack the top bit of each byte into a 32-bit mask in eax
        vzeroupper
 ret
```

into this:

```asm
.LCPI1_1:
        .long 0x00000020 ; table bytes 0-3 (little-endian): slot 0 = ' ' (0x20)
        .long   0x00000000 ; table bytes 4-7: all 0
        .long   0x000a0900 ; table bytes 8-11: slot 9 = '\t' (0x09), slot 10 = '\n' (0x0a)
 .long   0x00000000 ; table bytes 12-15: all 0
maskWhitespaceWithShuffle:
 vbroadcastf128  ymm1, xmmword ptr [rip + .LCPI1_1] ; copy the 16-byte table into both 128-bit halves of ymm1
        vpshufb ymm1, ymm1, ymm0; use each input byte as a table index: 0 in lower nibble => ' ', 9 in lower nibble => '\t', 10 in lower nibble => '\n', else => 0 (bytes with the top bit set also give 0)
        vpcmpeqb        ymm0, ymm1, ymm0; match the places where the shuffled result matched the original source
 vpmovmskb       eax, ymm0 ; pack the top bit of each byte into a 32-bit mask in eax
        vzeroupper
 ret
```

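This strategy needs one adjustment when none of the bytes being matched has 0 in its lower nibble. In that case slot 0 of the table cannot be left as 0: a 0x00 input byte would look up 0, compare equal to itself, and be reported as a match even though 0 is not one of the bytes we are trying to match. Instead, slot 0 can hold any byte whose lower nibble is not 0, e.g. 1, so that no input byte can ever match it.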

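This is applicable on ARM as well, using `tbl`. One difference to account for: `tbl` with a single 16-byte table returns 0 for any index of 16 or more instead of looking only at the lower nibble, so the indices have to be masked (e.g. ANDed with 0x8F) before the lookup to get the same result as `pshufb`.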

Zig implementation: [Godbolt link](https://zig.godbolt.org/z/EPqj79KxY)

(sorry it's kinda messy but Zig currently does not have a builtin dynamic shuffle and LLVM's autovectorizer can't identify opportunities to collapse a routine into a shuffle in context)

```zig
const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const Chunk = @Vector(std.simd.suggestVectorLengthForCpu(u8, builtin.cpu).?, u8);

/// Baseline implementation: compares `v` against each of ' ', '\t' and '\n'
/// in turn and ORs the three comparison results together.
/// Returns an unsigned integer with `@sizeOf(Chunk)` bits holding the combined
/// per-byte match flags.
export fn maskWhitespace(v: Chunk) std.meta.Int(.unsigned, @sizeOf(Chunk)) {
    const Mask = std.meta.Int(.unsigned, @sizeOf(Chunk));
    const needles = [_]u8{ ' ', '\t', '\n' };

    var bits: Mask = 0;
    // Unrolled at compile time: one splat + compare per needle byte.
    inline for (needles) |needle| {
        const splatted: Chunk = @splat(needle);
        bits |= @as(Mask, @bitCast(v == splatted));
    }
    return bits;
}

/// Shuffle-based implementation: looks every byte of `chunk` up in a 16-entry
/// table and reports the positions where the looked-up byte equals the input
/// byte. The table stores ' ', '\t' and '\n' at the slot given by their own
/// lower nibble (0, 9 and 10) and 0 everywhere else.
/// Returns an unsigned integer with `@sizeOf(Chunk)` bits holding the per-byte
/// match flags.
export fn maskWhitespaceWithShuffle(chunk: Chunk) std.meta.Int(.unsigned, @sizeOf(@TypeOf(chunk))) {
    const table = comptime blk: {
        var slots = [_]u8{0} ** 16;
        // Each needle is stored at the index equal to its lower nibble.
        for (" \t\n") |needle| {
            slots[needle & 0x0F] = needle;
        }
        break :blk slots;
    };

    const looked_up = lookup_chunk(table, chunk);
    const hits = looked_up == chunk;
    return @as(std.meta.Int(.unsigned, @sizeOf(Chunk)), @bitCast(hits));
}

/// Byte shuffle using the x86 `vpshufb` instruction (AT&T operand order in the
/// template: indices, table, destination).
///
/// `x` is the table being read and `mask` supplies the per-byte indices; the
/// shuffled bytes are returned.
fn mm_shuffle_epi8(x: Chunk, mask: Chunk) Chunk {
    return asm (
        \\vpshufb %[mask], %[x], %[out]
        : [out] "=x" (-> Chunk),
          // Both sources are only read by the instruction, so they are plain
          // register inputs; the read-write `+` modifier applies to outputs
          // and does not belong on an input operand.
        : [x] "x" (x),
          [mask] "x" (mask),
    );
}

/// AArch64 16-byte table lookup implemented with a single `tbl` instruction.
///
/// Operand roles follow the asm template, not the parameter names: `mask` is
/// placed in the `{...}` register-list position (the table) and `x` in the
/// final position (the indices). `lookup_chunk` calls this as
/// `lookup_16_aarch64(b, a)`, i.e. with the input chunk as `x` and the table
/// as `mask` -- the opposite naming from `mm_shuffle_epi8`.
///
/// NOTE(review): per the Arm definition of `TBL`, an index of 16 or more
/// yields 0 with a one-register table; the upper index bits are not ignored
/// as they are by x86 `pshufb`. Confirm callers mask the indices if they rely
/// on lower-nibble behaviour.
fn lookup_16_aarch64(x: @Vector(16, u8), mask: @Vector(16, u8)) @Vector(16, u8) {
    return asm (
        \\tbl  %[out].16b, {%[mask].16b}, %[x].16b
        // `&` marks the output as early-clobber so it is not allocated to the
        // same register as either input.
        : [out] "=&x" (-> @Vector(16, u8)),
        : [x] "x" (x),
 [mask] "x" (mask),
    );
}

/// Per-byte lookup of `b` in the 16-entry table `a`, with x86 `pshufb`
/// semantics on every target: for each byte `c` of `b` the result byte is 0
/// when the top bit of `c` is set, and `a[c & 0x0F]` otherwise.
///
/// `a` is the comptime-known table; `b` holds the bytes used as indices.
/// Returns a `Chunk` of looked-up bytes, one per byte of `b`.
fn lookup_chunk(comptime a: [16]u8, b: Chunk) Chunk {
    switch (builtin.cpu.arch) {
        // The 16-byte table is repeated to fill a whole `Chunk` before being
        // handed to `vpshufb`.
        .x86_64 => return mm_shuffle_epi8(a ** (@sizeOf(Chunk) / 16), b),
        // `tbl` returns 0 for any index >= 16 rather than looking only at the
        // lower nibble, so an unmasked ' ' (0x20) would look up 0 and never
        // match. Keeping just the lower nibble and the top bit (0x8F) gives
        // the same result as `pshufb`: top-bit bytes stay out of range and
        // yield 0, all others index by their lower nibble.
        .aarch64, .aarch64_be => return lookup_16_aarch64(b & @as(Chunk, @splat(0x8F)), a),
        else => {
            // Portable fallback. Arbitrary input bytes are expected here, so
            // the index is reduced exactly as `pshufb` would instead of
            // asserting that it is already below 16.
            const indices: [@sizeOf(Chunk)]u8 = b;
            var result: [@sizeOf(Chunk)]u8 = undefined;
            for (indices, &result) |c, *slot| {
                slot.* = if ((c & 0x80) != 0) 0 else a[c & 0x0F];
            }
            return result;
        },
    }
}
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8WN2TmzgS_2s0L12hQBiDH_wwxvFV7pK6rc1WsnsvUwIaWxmBiCT8MX_9lQTYYI8n2a27pZIZu9Xqj19_Mkxrvq0RlyRakWj9wFqzk2r5hQleMPX8kMnitPxnqw1odgCz4xqM4vkz8BpItHKEPS9QkmhNaLIzptEkfCR0Q-jmcDh4J9maNkMvl5WlMJPvSLjZk3D969Ppd__3b8Xn1R-Ezg0J19SPEk3owiP-mviP_c9o9Q9ZZFIYELx-fk3PC996247Hk2prKYRu3v_y_Vu8-NfxD0IXY4HvyxJzw_coToSmcEDIWQ2mVbXzz0odq5_73T-mq47ifUx_-eA_-WdG6B_vBZUEgJASmoZ0wh1cuCdsdBZP-OgdqY456E4qpp-_7rhB3bAcby7sm7xq8Hs2fD9VVWAdPVWV3_8-SFVAY5QNoeINELqCs1vRGki4gsqGCgiN7X9CN9aCjX-k_rU2VhQZjKX_hJbAhtEqkXt08qPUWFWsLvqvtf1qJPiOFlwrrXjdZlYJ_XmltFfKhAAUWGFtNBy4EJAhaDRWXQB4zLExY6NyqRTqRtaFdhb9EO5rIOgFT5_QWPcu2U8X2bzeWul3wbjWKtUdbbdYyX2lnwcDkR3PF6aMNtHapkHV0xWaqxoYFwavjfwTBROMS2BIbSHrLfhH3z3XmdWdwvncf-uc-Yvz-b2L07r5ys3u864tSzEqoX2mJCtypk0Z0ORSO8c3EivosvkKdL1ry2xce5caDFfg2_4p5AEV1DzLBAIJ1yR8f6m3FBZv8PQpYtmCt2QNuUNTQKHPJz-fwhOruxQ2O4RGsBw1HHao0BF0h2QBCnUrTMeKhTuTim95zQRo2aocB6j_T3n5mx1J2ihmcHuCg2xFATu2R1tcGUIlC15yLICXQCi9AY9QCgfrVS0NZGirsvfFg69uVBAam06if8bTSjM7ZlxN25uHHTNwQEJjC5A69dXtRHnwoXbckDON1mlWn0CaHSrITga7s0KiHuliXc5YPCexzp2DWR-SouNhBrSQxoXd23oQeDcIcQ2saQTPmZUia3j89RMwDQcUYsL9H74FXjVdv2SGy5qEj_A_n8qEJloqdQLegfjM64JBhVqfIGsNWDPyVimsjTg5cBzOPTZZy4XhNRSnmlU8H9LRtdCPH7986rpua-QecyMVf0F1DiUvsDa8PIFsGqlMW3PD0XX6XArBGm0VKNkaXiO4rsfO8nkNuawNHs21O0NqvvBtR8llbbcoU9ikATLzeWXVEZoQSrUpCKVWRrgasw9-vXKlP3r1GtMalXG3tCm8ArN263XEK85019bPg_gvDhwbCVN4mleFp9vtFrXpDj5ivTW7jVRp0xKatInNr94ML7e0hUfCjaXas4tR3U88WuOhrOFqgaHJ3qaUM4XQhTO5QsO8D7X11Wtrt6MWVjCZ-Zq_4L9LQpPhgr1D4tWla-yZApbnbdUKZh0KH39aJpn5v52a7vO-k-3Eh2vwz_5YHbwWNh9KqaCLB7ie7Pot7SxKcxKnI8tGJtnTHvSMm5Rpa9S-ayeOyvTFwc7CRtj2kuRnoybmkHg0gRS6bXYCwRCKge_tmIyHI01yZ8dfCdEYzvwSrpuI5bJqDK_QhS5ryxLVOXsrrNwOjBYSEq2COYnW0wSzIrpbJFr1G-z6PEtf5RoN0Qv7QHnjRn1zo76-0cM_RPEvpHN_dEkNIeVz2zz1ECa9TTSFAdQhdcYg34m5DXb11HewJ2x4QmhyHIU3dckwiXffJuJbL5muXAFMRjaJUhKlwxJEaESilZNpx0TaE46Tb7I1ly2qnzAdzY5pEq6PrsZo8s4O3AtW6ZXm7uZxuEdXw73jhP1ikGUbeBzpSurbSPaBCeZPjKl8N58NWI7baTC_9MQRundZFveO_mwETCZggq8XzDOHebyaRMXR46vg
OOJr6E7iQuh8Epr7Xv0oWH9LpIYSOncc1ltxbispZD_KfX3g3ctxMhp_nk2AaZC8YzJ_ms-GNbEP2G31MSD0kdBH6FrmTUsAQjdgwVx05o0d986Jl54_P2V4pfO1PHWZwF4LzPhdYeL4eSdnCtQZpWF_GGaUf92bh6eflr7n3Z_jKZ9OzaunW1typzIj0Yp3L_XDiO12HJpYBjdh_aO_uWeOK6JehhPISLTKJwJHz2XE9piqG7auhF6ZypcPw174UCzDYhEu2AMugzjw6TyZz5OH3XI2S7IgLONk4fsRK7GIw7wos0VBZ0UQJ_4DX1KfzvwwoEESxrPEizDHnGIQL1jpY5mRmY8V48ITYl_Z1fuBa93iMpkt5vGDYBkK7f7eR2mNB3CHtpaj9YNa2jvvsnarycwXXBt9kWK4Ebj8jHtUTMDnD5_WgN9bJt4hs4kfa_eHNC0rtIWl7UtJKUXhXkrc1twt38Py_NAqsZy-Mmy52bVZ_7dCq7f_9a5R8hvmhtCNs1YTunHe_DcAAP__ygrUFQ">