<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/76812>76812</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Basic vectorized compares should fallback on SWAR emulation on non-vector architectures
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
Validark
</td>
</tr>
</table>
<pre>
On non-vector architectures, we can efficiently emulate vector operations a lot of times.
First example is pretty similar to https://github.com/llvm/llvm-project/issues/66159. [Godbolt link](https://zig.godbolt.org/z/8WGG4hzfW). (`foo` should be the same as `bar`)
```zig
const std = @import("std");
export fn foo(data: [*]align(64) const u8) bool {
const vec: @Vector(64, u8) = data[0..64].*;
return @as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat(0x80)))) == 0;
}
export fn bar(data: [*]align(64) const u8) bool {
var vec_acc: u64 = 0;
inline for (0..8) |i| {
const vec: u64 = @bitCast(data[i*8..][0..8].*);
vec_acc |= vec;
}
return (vec_acc & 0x8080808080808080) == 0;
}
```
Movmasks are pretty useful too.
```zig
export fn foo(data: [*]align(64) const u8) u64 {
const vec: @Vector(64, u8) = data[0..64].*;
const underscores: u64 = @bitCast(vec == @as(@Vector(64, u8), @splat('_')));
const alpha_lo =
@as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('a')))) &
@as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('z'))));
const alpha_hi =
@as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('A')))) &
@as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('Z'))));
const digit =
@as(u64, @bitCast(vec >= @as(@Vector(64, u8), @splat('0')))) &
@as(u64, @bitCast(vec <= @as(@Vector(64, u8), @splat('9'))));
return underscores | alpha_lo | alpha_hi | digit;
}
```
In order to do the same thing efficiently on a non-vector architecture, we might do:
```zig
fn bar(data: [*]align(64) const u8) u64 {
var vec_acc: u64 = 0;
inline for (0..8) |i| {
const v: u64 = @bitCast(data[i * 8 ..][0..8].*);
const ones: @TypeOf(v) = @bitCast(@as(@Vector(@divExact(@bitSizeOf(@TypeOf(v)), 8), u8), @splat(1)));
const mask = comptime ones * 0x7F;
const low_7_bits = v & mask;
// Inspired by: http://0x80.pl/notesen/2023-03-06-swar-find-any.html
const non_underscore = (low_7_bits ^ comptime ones * '_') +% mask;
// alpha check:
// Upper 3 bits must be 4 or 6. Lower 5 bits must be between 1 and 26
const magic_mask = ((v & comptime ones * ~@as(u7, 0x20)) ^ comptime ones * 0x40);
const non_0x40_or_0x60 = magic_mask +% mask;
const non_alpha = magic_mask +% comptime ones * (0x80 - 27);
// digit check:
// Upper nibble must be 3. Lower nibble must be [0, 9]
const flipped_0x30 = low_7_bits ^ comptime ones * 0x30;
const non_digit = flipped_0x30 +% comptime ones * (0x80 - 0xA);
const chunk = ~v & (non_0x40_or_0x60 ^ (non_digit & non_alpha & non_underscore));
vec_acc |= swarMovMask(chunk) << (i * 8);
}
return vec_acc;
}
/// Creates a bitstring from the most significant bit of each byte in a given bitstring.
///
/// E.g. 1....... 0....... 0....... 1....... 0....... 1....... 1....... 1....... => 10010111
fn swarMovMask(v: anytype) @TypeOf(v) {
comptime assert(@divExact(@bitSizeOf(@TypeOf(v)), 8) <= 8);
const ones: @TypeOf(v) = @bitCast(@as(@Vector(@sizeOf(@TypeOf(v)), u8), @splat(1)));
const msb_mask = 0x80 * ones;
// We are exploiting a multiplication as a shifter and adder, and the derivation of this number is
// shown here as a comptime loop.
// This trick is often generalizable to other problems too: https://stackoverflow.com/a/14547307
comptime var mult: @TypeOf(v) = 0;
comptime for (0..@sizeOf(@TypeOf(v))) |i| {
mult |= @as(@TypeOf(v), 1) << (7 * i);
};
// Example with 32 bit integers:
// We want to concentrate the upper bits of each byte into a single nibble.
// Doing the gradeschool multiplication algorithm, we can see that each 1 bit
// in the bottom multiplicand shifts the upper multiplicand, and then we add all these
// shifted bitstrings together. (Note `.` represents a 0)
// a.......b.......c.......d.......
// * ..........1......1......1......1
// -------------------------------------------------------------------------
// a.......b.......c.......d.......
// .b.......c.......d..............
// ..c.......d.....................
// + ...d............................
// -------------------------------------------------------------------------
// abcd....bcd.....cd......d.......
// Then we simply shift to the right by `32 - 4` (bitstring size minus the number of relevant bits) to isolate the desired `abcd` bits in the least significant byte!
// Inspired by: http://0x80.pl/articles/scalar-sse-movmask.html
return (mult *% (v & msb_mask)) >> (@bitSizeOf(@TypeOf(v)) - @sizeOf(@TypeOf(v)));
}
```
This implementation has exactly the same semantics, but [on riscv64 sifive_u74, it reduces the instruction count from 1143 to 204](https://zig.godbolt.org/z/64drbKqfv) (change which function has `export` at the beginning).
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzEWV9v47gR_zTMy8ACTcmy85CH_D0c2usBvest0JeAkkYSG4p0ScqJ83CfvRhK_m9v0r3bVhusLImcGf7mx5khKb1XjUG8YbM7Nnu4kn1orbv5TWpVSfdyVdhqffOzAWPNZIVlsA6kK1sVsAy9Q8_EPbwilNIA1rUqFZqg14Bdr2VAGLvYJToZlDUeJGgbwNYQVIc-YfyB8dvh_yflfAB8k91SIygPS4chrMGrTmnpIFhoQ1h6lt4y8cTEU6NC2xdJaTsmnrRebW6TpbP_wjIw8aS878nKpzyfzq4TYLO7H2xVWB1AK_PCZg9MLA6lvqsmaYY2iXUNvWHiafHlhx-y9r3-wgSJEQuW89palnPwre11BQVCaBG87BCkB5bzQjqWcyau90dJb-Lfu2qGN6U1PoAPFbD0AVjGVbe0LpAOIXyomBAkI73bF4Nv1AZqA2SFWFQySJbe0gCZuGWzB6lVY5hY5BkT1zDo6Bf0u7BWA5uP4gBg_LrCMkrI-G_RbWPn-7EbGRe1zO54kuQZmz0kpCrdE-Qw9M6QCOmZWPRDf5bxQoV76WlMKyyBpY_jUGOzCxrHrn6pJXXkb4uI5eaPDCIpfIfM_OE8ROSIPwjRSjoC6FmWEaQ-z-BQOTVSRiuDUFtHDOFJMgA3v1dsfr8n7wDvjagDlEagFRO3iyQhmkbUFxvQ9_gA4zVaR-pIXJS-12QLzr6fojuGXiIHQvjw3wcwb6i8j_pPdtVJ_-JBOtzM4N5j3WsI1iZfnwp_gNURxu9C6lGLqdD50lLQu-S1gdsP_z23mZg_MzHf0ftUv9TLVj5rGxUM377LNGNiLg9MIZRE_mmV99-k8v1I5VG0O8CgVXsYjNf3guL2MhSf1vxtiPzzA0S2xKhUo8L_DhH-f0Pk-hIiexFtb5ZSINybNdsHos_8foDtkzHtRwPWVRhLkMru0nxolWkOCh9rQF4qlsZaqVNNG6CyVHB8NRh-S-I6iYIf563PJq3xGsPqh5kLmLiFBXwyeQ1SrRmCK8v4r-sl_lwTYzZh-kDLOfKwjFdq9fgmy7FFocIv6n0QcyxyZNiGaecoNz3PtJ25lOeiaaXtllTPxgHEgfO3-dOFXtq-Ps-fCxV87LuKqZdEnZnhcTrFqhR-NH6pHFZQrAkiqlm3JSsl7mSpmXgyNqBHw8ST4CKd8HTC84l_lW5SK1NNpFknbej0ObuMNc-76TNALhb71s4ez4x0l7qAiTsmZmfHMg4izkAoWyxftvQ_Guc_lkt0kEJU2fU-UGGdgXWQJ_BX-4oOZocfCwyviAamIE0Fx9Fo46pGlc9bh8WAshigPx3S79sINidK8DfBN7HuLAT8LeOXWUK4Uotn6575W86jAfv2nEHtVMKA3Pmu55wSy2WYwBAyv8qsIYF8wilGFYXGLe7pxh9H72m2E27XNPPPDKfWarnE6pm_pQMYH3KMWn4Fm20GPBL9MTr87SvRqGx7M9Dl94EpTCxOnTl73LwfzRD5vsPGp93EuhBRjqp3mrI_2dVPRAqxiJYMcfCepfekcIyvx6KOlkB7mXGbBM4vlgZHk6_vHcqAtFInjwRHKa52totpr7O0VFWNUbUqpQnUhlbzKMsWinVAUJQBG7VCs-ufHOk4VvmYNAlMk-ECfvLj9NP08o9YgD_ClPMpn06n21x6CGlMYNKsw3qJEdmTlHO4lhg5JL1HF_5AstlUPyee-5MyoP_Ahs8mujFs-mIXNOOcIdYNRp6ElNGZXzAu_fBtqa0KxB4JXa-DWmpVxl0gkMQu36o6oItBW1YVOrKKHohnFTq1GhrbmuosD6bvCnSg_IlG39pXAy06HCRvvaWtXSYnzX8lacGp8gWUB1sHNNCgQSe1epcUyYIFG1p0sHS20Nh5Wrpusu5up8gHWb7YFbpa29dxG0oy8TTNZtk85fMtlKM5VIsREhcdzI8YMfbblWYfOvgrhRtp3oSXPfIcybiH6WGgmUeXq3OB5iIFHsc9vFcVWkhFDBLKBGzQ-YMUs6PMKwWTYIl3JZrgZBi20_qYeWJuOAozwRKJlGk0jhno1NUPlvhHchonK_Rla60-YaNurFOh7fY2Mz2SehkGjVMy4ES4MlFyYUOw3Z5QUw3c9nsD2P-6x3ND-mRVgdSanj2eITdNk2oXTImMDRI94z7k32xAYDlPWM7B4dJR_RdoGux2HvfEAcgxUBbjvRzv1Xg_6UL-T7bX9OztpNPkz7oOC8hvMP9iy4sdLrW8DNAdXGx9qdP3A6gooyXjLRlvRwCdiYoDG73qlno90I4mJHHYxSVrsSaepQImkBHZmFjsKgQKS9Ap0w-sH4O1rcGhxtVYKXgKLcGC8lZvJniFPi5rWM7JchIcZ_s4uzTK44pjHZCJ6bnlxefWSNIFVep4KuBLqaWbeI-Tbti3PFoe7TZKh_BJq9cZbNcOm_y43ZF-pNLjUwUBTODjaP7J_YmY0chv2KEJQ1RrpQek8kSvdxsWHjtpgirjuU3RB6rVrQGnfLnKM_CqVit87udxG0YFcFj1JQ4uVcYH15dReGl7E4aqcDrNUvKp4NnnT1PyrHLFX_5dD5mPClxpGoTXVpUt1L0pt0NgOR82hYkYMgwRFxtljDJNPIu5qm7S6jq9lld4M53zTCzyuZhftTdzXmTzfLEQsk5nRVbJMq3SukrT6zSt5tPFlboRXGR8ytOpmC2yeYLlPOWYC0yvsyydZSzj2EmlE61XHZl_Fc-Tbub5YiqutCxQ-3hyJoTBV4gfmRBs9nDlbuJJVNE3nmVcKx_8TkpQQePNnfSqHI_I1DtWMeFLh35zolRLrQtZvoA18MuX27-Px2qxKLp8JHfVO33z7QdlcWz_CQAA__9-qL9G">