[llvm-branch-commits] [libc] [libc][wctype] Upstream immintrin storage from PtrHash-cc prototype to LLVM libc (PR #175038)
Muhammad Bassiouni via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jan 9 14:00:46 PST 2026
https://github.com/bassiounix updated https://github.com/llvm/llvm-project/pull/175038
>From 0400f21353ef23e82e6515ead54f7e8ff2088a7c Mon Sep 17 00:00:00 2001
From: bassiounix <muhammad.m.bassiouni at gmail.com>
Date: Thu, 8 Jan 2026 19:44:47 +0200
Subject: [PATCH] [libc][wctype] Upstream immintrin storage from PtrHash-cc
prototype to LLVM libc
---
.../wctype/conversion/random/CMakeLists.txt | 22 ++
.../__support/wctype/conversion/random/imm.h | 268 ++++++++++++++
.../wctype/conversion/random/vec128_storage.h | 76 ++++
.../conversion/random/vec256_storage.cpp | 60 +++
.../wctype/conversion/random/vec256_storage.h | 63 ++++
.../conversion/random/vec512_storage.cpp | 342 ++++++++++++++++++
.../wctype/conversion/random/vec512_storage.h | 82 +++++
7 files changed, 913 insertions(+)
create mode 100644 libc/src/__support/wctype/conversion/random/imm.h
create mode 100644 libc/src/__support/wctype/conversion/random/vec128_storage.h
create mode 100644 libc/src/__support/wctype/conversion/random/vec256_storage.cpp
create mode 100644 libc/src/__support/wctype/conversion/random/vec256_storage.h
create mode 100644 libc/src/__support/wctype/conversion/random/vec512_storage.cpp
create mode 100644 libc/src/__support/wctype/conversion/random/vec512_storage.h
diff --git a/libc/src/__support/wctype/conversion/random/CMakeLists.txt b/libc/src/__support/wctype/conversion/random/CMakeLists.txt
index dd9d577e4cd8b..a7cab77016033 100644
--- a/libc/src/__support/wctype/conversion/random/CMakeLists.txt
+++ b/libc/src/__support/wctype/conversion/random/CMakeLists.txt
@@ -5,3 +5,25 @@ add_header_library(
DEPENDS
libc.src.__support.wctype.conversion.utils.utils
)
+
+# Header-only 128-bit storage type; it depends on cpp::array and the Slice
+# utility that vec128_storage.h includes.
+add_header_library(
+ vec128_storage
+ HDRS
+ vec128_storage.h
+ DEPENDS
+ libc.src.__support.CPP.array
+ libc.src.__support.wctype.conversion.utils.slice
+)
+
+# 256- and 512-bit storage types together with the imm.h helpers, compiled from
+# the two .cpp files listed under SRCS. Builds on the vec128_storage target
+# declared in this same directory.
+add_object_library(
+ vector_storage
+ HDRS
+ imm.h
+ vec256_storage.h
+ vec512_storage.h
+ SRCS
+ vec512_storage.cpp
+ vec256_storage.cpp
+ DEPENDS
+ .vec128_storage
+)
diff --git a/libc/src/__support/wctype/conversion/random/imm.h b/libc/src/__support/wctype/conversion/random/imm.h
new file mode 100644
index 0000000000000..e67cb1b5774b9
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/imm.h
@@ -0,0 +1,268 @@
+//===-- Portable subset of <immintrin.h> ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Only little-endian is supported (runtime code is not affected by this).
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H
+
+#include "vec512_storage.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace immintrin {
+
+using random::vector_storage::vec128_storage;
+using random::vector_storage::vec256_storage;
+using random::vector_storage::vec512_storage;
+
+// Lane-wise 32-bit addition of two 256-bit vectors; each lane wraps mod 2^32.
+LIBC_INLINE static constexpr vec256_storage
+mm256_add_epi32(const vec256_storage &a, const vec256_storage &b) {
+  vec256_storage sum{{}};
+  for (size_t lane = 0; lane < 8; ++lane)
+    sum.u32x8[lane] = a.u32x8[lane] + b.u32x8[lane];
+  return sum;
+}
+
+// 512-bit overload: lane-wise 32-bit addition over all 16 lanes, each lane
+// wrapping mod 2^32.
+LIBC_INLINE static constexpr vec512_storage
+mm256_add_epi32(const vec512_storage &a, const vec512_storage &b) {
+  vec512_storage sum{{}};
+  for (size_t lane = 0; lane < 16; ++lane)
+    sum.u32x16[lane] = a.u32x16[lane] + b.u32x16[lane];
+  return sum;
+}
+
+// Bitwise XOR of two 256-bit values, each given as eight 32-bit words.
+LIBC_INLINE static constexpr vec256_storage
+mm256_xor_si256(const cpp::array<uint32_t, 8> &a,
+                const cpp::array<uint32_t, 8> &b) {
+  cpp::array<uint32_t, 8> words{};
+  for (size_t w = 0; w < 8; ++w)
+    words[w] = a[w] ^ b[w];
+  return words; // converted to vec256_storage by its array constructor
+}
+
+// 512-bit overload: bitwise XOR of two values given as sixteen 32-bit words.
+LIBC_INLINE static constexpr vec512_storage
+mm256_xor_si256(const cpp::array<uint32_t, 16> &a,
+                const cpp::array<uint32_t, 16> &b) {
+  vec512_storage out{{}};
+  for (size_t w = 0; w < 16; ++w)
+    out.u32x16[w] = a[w] ^ b[w];
+  return out;
+}
+
+// Byte shuffle inside each 128-bit lane of `a`, steered by the bytes of `b`:
+// a control byte with bit 7 set produces 0; otherwise its low 4 bits pick the
+// source byte from the same lane of `a`.
+LIBC_INLINE static constexpr vec256_storage
+mm256_shuffle_epi8(const vec256_storage &a, const vec256_storage &b) {
+  vec256_storage out{{}}; // every byte starts at zero and is OR-ed in below
+
+  // Reads byte `idx` (0..15) of a lane stored as four little-endian words.
+  auto byte_at = [](const uint32_t *lane, int idx) {
+    return static_cast<uint8_t>((lane[idx / 4] >> ((idx % 4) * 8)) & 0xFF);
+  };
+
+  // The two 128-bit lanes start at words 0 and 4 and never mix.
+  for (int base = 0; base < 8; base += 4) {
+    const uint32_t *src = &a.u32x8[base];
+    const uint32_t *ctrl = &b.u32x8[base];
+    for (int pos = 0; pos < 16; ++pos) {
+      const uint8_t selector = byte_at(ctrl, pos);
+      if ((selector & 0x80) == 0) {
+        const uint32_t picked = byte_at(src, selector & 0x0F);
+        out.u32x8[base + pos / 4] |= picked << ((pos % 4) * 8);
+      }
+    }
+  }
+
+  return out;
+}
+
+// Builds a 256-bit vector from four 64-bit values listed from the highest
+// quadword (`a`) down to the lowest (`d`).
+LIBC_INLINE static constexpr vec256_storage
+mm256_set_epi64x(long long a, long long b, long long c, long long d) {
+  // Storage order is the reverse of the argument order.
+  const long long quads[4] = {d, c, b, a};
+
+  vec256_storage out{{}};
+  for (int q = 0; q < 4; ++q) {
+    // Each quadword occupies two 32-bit words: low half first, then high half.
+    out.u32x8[2 * q] = static_cast<uint32_t>(quads[q]);
+    out.u32x8[2 * q + 1] = static_cast<uint32_t>(quads[q] >> 32);
+  }
+  return out;
+}
+
+// Lane-wise bitwise OR of two 256-bit vectors.
+LIBC_INLINE static constexpr vec256_storage
+mm256_or_si256(const vec256_storage &a, const vec256_storage &b) {
+  vec256_storage out{{}};
+  for (size_t lane = 0; lane < 8; ++lane)
+    out.u32x8[lane] = a.u32x8[lane] | b.u32x8[lane];
+  return out;
+}
+
+// Logical right shift of every 32-bit lane by `count` bits, filling with zeros.
+// A count outside 0..31 clears the whole vector. The previous `count & 0x1F`
+// wrapped instead, so a count of 32 behaved like 0 and returned `a` unchanged.
+LIBC_INLINE static constexpr vec256_storage
+mm256_srli_epi32(const vec256_storage &a, int count) {
+  vec256_storage r{{}}; // zero-initialized: already the out-of-range result
+
+  if (count < 0 || count > 31)
+    return r;
+
+  for (int i = 0; i < 8; ++i)
+    r.u32x8[i] = a.u32x8[i] >> count;
+
+  return r;
+}
+
+// Logical left shift of every 32-bit lane by `count` bits, filling with zeros.
+// A count outside 0..31 clears the whole vector. The previous `count & 0x1F`
+// wrapped instead, so a count of 32 behaved like 0 and returned `a` unchanged.
+LIBC_INLINE static constexpr vec256_storage
+mm256_slli_epi32(const vec256_storage &a, int count) {
+  vec256_storage r{{}}; // zero-initialized: already the out-of-range result
+
+  if (count < 0 || count > 31)
+    return r;
+
+  for (int i = 0; i < 8; ++i)
+    r.u32x8[i] = a.u32x8[i] << count;
+
+  return r;
+}
+
+// Assembles a 256-bit result from 128-bit halves of V1 and V2.
+// The low nibble of M controls the lower output half and the next nibble the
+// upper one. Within a nibble, bit 3 zeroes that half; otherwise bit 1 picks the
+// source vector (0 = V1, 1 = V2) and bit 0 picks its half (0 = lower,
+// 1 = upper). Bit 2 is ignored.
+LIBC_INLINE static constexpr vec256_storage
+mm256_permute2x128_si256(const vec256_storage &V1, const vec256_storage &V2,
+                         int M) {
+  vec256_storage out{{}};
+
+  for (int half = 0; half < 2; ++half) {
+    const int selector = (M >> (half * 4)) & 0xF;
+    if (selector & 0x8)
+      continue; // zeroed half: `out` is already zero-initialized
+
+    const vec256_storage &from = (selector & 0x2) ? V2 : V1;
+    const int from_base = (selector & 0x1) ? 4 : 0;
+    for (int w = 0; w < 4; ++w)
+      out.u32x8[half * 4 + w] = from.u32x8[from_base + w];
+  }
+
+  return out;
+}
+
+// Concatenates two 128-bit vectors into one 256-bit vector: `lo` fills words
+// 0..3 and `hi` fills words 4..7.
+LIBC_INLINE static constexpr vec256_storage
+mm256_setr_m128i(const vec128_storage &lo, const vec128_storage &hi) {
+  vec256_storage joined{{}};
+  for (size_t w = 0; w < 4; ++w) {
+    joined.u32x8[w] = lo.u32x4[w];
+    joined.u32x8[4 + w] = hi.u32x4[w];
+  }
+  return joined;
+}
+
+// Permutes the four 32-bit words of each 128-bit lane using one shared
+// pattern: bits [2i+1 : 2i] of `imm` name the source word for output word i.
+LIBC_INLINE static constexpr vec256_storage
+mm256_shuffle_epi32(vec256_storage a, int imm) {
+  vec256_storage out{{}};
+  for (int i = 0; i < 4; ++i) {
+    const int pick = (imm >> (2 * i)) & 0x3;
+    out.u32x8[i] = a.u32x8[pick];         // lower lane: words 0..3
+    out.u32x8[4 + i] = a.u32x8[4 + pick]; // upper lane: words 4..7
+  }
+  return out;
+}
+
+// Returns one 128-bit half of V: the lower half when bit 0 of M is clear, the
+// upper half when it is set. All other bits of M are ignored.
+LIBC_INLINE static constexpr vec128_storage
+mm256_extracti128_si256(const vec256_storage &V, int M) {
+  const int first = (M & 1) ? 4 : 0;
+  return {{V.u32x8[first], V.u32x8[first + 1], V.u32x8[first + 2],
+           V.u32x8[first + 3]}};
+}
+
+// Adds the two 64-bit lanes of `a` and `b`, each lane wrapping modulo 2^64.
+// Lane 0 is words {0, 1} and lane 1 is words {2, 3}, low word first. The carry
+// out of each low word is propagated into its high word; adding the four
+// 32-bit words independently (as before) silently dropped that carry.
+LIBC_INLINE static constexpr vec128_storage
+mm_add_epi64(const cpp::array<uint32_t, 4> &a,
+             const cpp::array<uint32_t, 4> &b) {
+  cpp::array<uint32_t, 4> sum{};
+  for (size_t lane = 0; lane < 4; lane += 2) {
+    const uint64_t x = (static_cast<uint64_t>(a[lane + 1]) << 32) | a[lane];
+    const uint64_t y = (static_cast<uint64_t>(b[lane + 1]) << 32) | b[lane];
+    const uint64_t s = x + y; // unsigned arithmetic wraps modulo 2^64
+    sum[lane] = static_cast<uint32_t>(s);
+    sum[lane + 1] = static_cast<uint32_t>(s >> 32);
+  }
+  return sum; // converted to vec128_storage by its array constructor
+}
+
+// Array-returning form of the 64-bit lane addition: words {0, 1} and {2, 3}
+// each form one little-endian 64-bit lane, and each lane sum wraps modulo 2^64.
+// The carry from a low word now reaches its high word instead of being lost.
+LIBC_INLINE static constexpr cpp::array<uint32_t, 4>
+add_epi64(const cpp::array<uint32_t, 4> &a, const cpp::array<uint32_t, 4> &b) {
+  cpp::array<uint32_t, 4> sum{};
+  for (size_t lane = 0; lane < 4; lane += 2) {
+    const uint64_t x = (static_cast<uint64_t>(a[lane + 1]) << 32) | a[lane];
+    const uint64_t y = (static_cast<uint64_t>(b[lane + 1]) << 32) | b[lane];
+    const uint64_t s = x + y; // unsigned arithmetic wraps modulo 2^64
+    sum[lane] = static_cast<uint32_t>(s);
+    sum[lane + 1] = static_cast<uint32_t>(s >> 32);
+  }
+  return sum;
+}
+
+} // namespace immintrin
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H
diff --git a/libc/src/__support/wctype/conversion/random/vec128_storage.h b/libc/src/__support/wctype/conversion/random/vec128_storage.h
new file mode 100644
index 0000000000000..1416fe5b9a60a
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec128_storage.h
@@ -0,0 +1,76 @@
+//===-- 128-bit storage for StdRng - wctype conversion ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This is a portable implementation of a 128-bit vector storage built on
+// fixed-size arrays. It is a counterpart of <immintrin.h>'s `__m128i` that can
+// be used in `constexpr` code.
+// Only little-endian is supported (runtime code is not affected by this).
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/wctype/conversion/utils/slice.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+// 128-bit value held as four 32-bit words, least-significant word first.
+union vec128_storage {
+  // `mutable` so the words can be written even through a const object.
+  mutable cpp::array<uint32_t, 4> u32x4;
+
+  // Implicit conversions from a word array. The lvalue overload takes a const
+  // reference so that const arrays are accepted as well.
+  LIBC_INLINE constexpr vec128_storage(cpp::array<uint32_t, 4> &&x)
+      : u32x4(x) {}
+  LIBC_INLINE constexpr vec128_storage(const cpp::array<uint32_t, 4> &x)
+      : u32x4(x) {}
+  // Value-initializes the word array.
+  LIBC_INLINE constexpr vec128_storage() : u32x4() {}
+
+  // Implicit conversion back to the underlying word array (by copy).
+  LIBC_INLINE constexpr operator cpp::array<uint32_t, 4>() const {
+    return this->u32x4;
+  }
+
+  // Returns a copy of the four 32-bit words.
+  LIBC_INLINE constexpr cpp::array<uint32_t, 4> to_lanes() const {
+    return this->u32x4;
+  }
+
+  // Wraps four 32-bit words without reordering them.
+  LIBC_INLINE static constexpr vec128_storage
+  from_lanes(cpp::array<uint32_t, 4> &&xs) {
+    return vec128_storage(xs);
+  }
+
+  // Splits two 64-bit values into four 32-bit words: low half of xs[0], high
+  // half of xs[0], low half of xs[1], high half of xs[1].
+  LIBC_INLINE static constexpr auto from_lanes(cpp::array<uint64_t, 2> &&xs) {
+    cpp::array<uint32_t, 4> x = {
+        static_cast<uint32_t>(xs[0]), static_cast<uint32_t>(xs[0] >> 32),
+        static_cast<uint32_t>(xs[1]), static_cast<uint32_t>(xs[1] >> 32)};
+    return vec128_storage(x);
+  }
+
+  // Decodes 16 bytes as four little-endian 32-bit words. The slice length is
+  // checked with LIBC_ASSERT.
+  LIBC_INLINE static constexpr auto
+  read_le(conversion_utils::Slice<uint8_t> x) {
+    LIBC_ASSERT(x.size() == 16);
+    vec128_storage v = cpp::array<uint32_t, 4>{0};
+    const uint8_t *src = x.data();
+    for (size_t i = 0; i < 4; ++i) {
+      // Widen each byte to uint32_t before shifting: a uint8_t promotes to
+      // int, and shifting a byte >= 0x80 left by 24 would overflow signed int.
+      const uint32_t b0 = src[4 * i];
+      const uint32_t b1 = src[4 * i + 1];
+      const uint32_t b2 = src[4 * i + 2];
+      const uint32_t b3 = src[4 * i + 3];
+      v.u32x4[i] = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+    }
+    return v;
+  }
+};
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H
diff --git a/libc/src/__support/wctype/conversion/random/vec256_storage.cpp b/libc/src/__support/wctype/conversion/random/vec256_storage.cpp
new file mode 100644
index 0000000000000..806a452b99db7
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec256_storage.cpp
@@ -0,0 +1,60 @@
+//===-- 256-bit storage implementation --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This is a portable implementation of a 256-bit vector storage built on
+// fixed-size arrays. It is a counterpart of <immintrin.h>'s 256-bit AVX
+// vectors that can be used in `constexpr` code.
+// Only little-endian is supported (runtime code is not affected by this).
+
+#include "vec256_storage.h"
+#include "imm.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+// Per 128-bit lane, output words are input words {1, 2, 3, 0}.
+LIBC_INLINE constexpr vec256_storage
+vec256_storage::shuffle_lane_words3012() const {
+  constexpr int PATTERN = 0b00'11'10'01; // two bits per output word, word 0 last
+  return immintrin::mm256_shuffle_epi32(*this, PATTERN);
+}
+
+// Per 128-bit lane, output words are input words {2, 3, 0, 1}.
+LIBC_INLINE constexpr vec256_storage
+vec256_storage::shuffle_lane_words2301() const {
+  constexpr int PATTERN = 0b01'00'11'10; // two bits per output word, word 0 last
+  return immintrin::mm256_shuffle_epi32(*this, PATTERN);
+}
+
+// Per 128-bit lane, output words are input words {3, 0, 1, 2}.
+LIBC_INLINE constexpr vec256_storage
+vec256_storage::shuffle_lane_words1230() const {
+  constexpr int PATTERN = 0b10'01'00'11; // two bits per output word, word 0 last
+  return immintrin::mm256_shuffle_epi32(*this, PATTERN);
+}
+
+// Rebuilds the vector from its lower and upper 128-bit halves, which yields a
+// copy holding the same eight words in the same order.
+LIBC_INLINE constexpr vec256_storage vec256_storage::to_lanes() const {
+  return immintrin::mm256_setr_m128i(
+      immintrin::mm256_extracti128_si256(*this, 0),
+      immintrin::mm256_extracti128_si256(*this, 1));
+}
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wctype/conversion/random/vec256_storage.h b/libc/src/__support/wctype/conversion/random/vec256_storage.h
new file mode 100644
index 0000000000000..f3bd16b8731c5
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec256_storage.h
@@ -0,0 +1,63 @@
+//===-- 256-bit storage for StdRng - wctype conversion ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This is a portable implementation of a 256-bit vector storage built on
+// fixed-size arrays. It is a counterpart of <immintrin.h>'s 256-bit AVX
+// vectors that can be used in `constexpr` code.
+// Only little-endian is supported (runtime code is not affected by this).
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H
+
+#include "vec128_storage.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+// 256-bit value held as eight 32-bit words, least-significant word first.
+union vec256_storage {
+  // `mutable` so the words can be written even through a const object.
+  mutable cpp::array<uint32_t, 8> u32x8;
+
+  // Value-initializes the word array.
+  LIBC_INLINE constexpr vec256_storage() : u32x8() {}
+
+  // Implicit conversion from a temporary word array.
+  LIBC_INLINE constexpr vec256_storage(cpp::array<uint32_t, 8> &&x)
+      : u32x8(x) {}
+
+  // Implicit conversion back to the underlying word array (by copy).
+  LIBC_INLINE constexpr operator cpp::array<uint32_t, 8>() const {
+    return u32x8;
+  }
+
+  // Concatenates two 128-bit values: `lo` becomes words 0..3, `hi` words 4..7.
+  LIBC_INLINE static constexpr vec256_storage
+  construct_from_vec128(vec128_storage &&lo, vec128_storage &&hi) {
+    vec256_storage joined{{}};
+    for (size_t w = 0; w < 4; ++w) {
+      joined.u32x8[w] = lo.u32x4[w];
+      joined.u32x8[4 + w] = hi.u32x4[w];
+    }
+    return joined;
+  }
+
+  // NOTE(review): the members below are declared constexpr (hence implicitly
+  // inline) but are defined in vec256_storage.cpp. A translation unit that
+  // only includes this header has no definition to call - confirm intended.
+  // Each shuffle permutes the four words of every 128-bit lane.
+  LIBC_INLINE constexpr vec256_storage shuffle_lane_words3012() const;
+  LIBC_INLINE constexpr vec256_storage shuffle_lane_words2301() const;
+  LIBC_INLINE constexpr vec256_storage shuffle_lane_words1230() const;
+  LIBC_INLINE constexpr vec256_storage to_lanes() const;
+};
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H
diff --git a/libc/src/__support/wctype/conversion/random/vec512_storage.cpp b/libc/src/__support/wctype/conversion/random/vec512_storage.cpp
new file mode 100644
index 0000000000000..e239703d08443
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec512_storage.cpp
@@ -0,0 +1,342 @@
+//===-- 512-bit storage implementation --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "vec512_storage.h"
+#include "imm.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+// Joins two 256-bit vectors into one 512-bit vector: `lo` supplies words 0..7
+// and `hi` supplies words 8..15.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::construct_from_vec256(const vec256_storage &lo,
+                                      const vec256_storage &hi) {
+  vec512_storage joined{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    joined.u32x16[w] = lo.u32x8[w];
+    joined.u32x16[8 + w] = hi.u32x8[w];
+  }
+  return joined;
+}
+
+// Packs four 128-bit vectors into one 512-bit vector in argument order: `i`
+// becomes words 0..3, `j` words 4..7, `k` words 8..11 and `l` words 12..15.
+LIBC_INLINE constexpr vec512_storage vec512_storage::new128(vec128_storage i,
+                                                            vec128_storage j,
+                                                            vec128_storage k,
+                                                            vec128_storage l) {
+  const vec128_storage *parts[4] = {&i, &j, &k, &l};
+  vec512_storage packed{{}};
+  for (size_t p = 0; p < 4; ++p)
+    for (size_t w = 0; w < 4; ++w)
+      packed.u32x16[4 * p + w] = parts[p]->u32x4[w];
+  return packed;
+}
+
+// Adds `rhs` into this vector lane by lane (32-bit lanes, mod 2^32) and returns
+// *this. The method is const; the write goes through the `mutable` member.
+LIBC_INLINE constexpr const vec512_storage &
+vec512_storage::operator+=(vec512_storage &rhs) const {
+  const vec512_storage sum = immintrin::mm256_add_epi32(*this, rhs);
+  u32x16 = sum.u32x16;
+  return *this;
+}
+
+// Lane-wise 32-bit sum of the two vectors (mod 2^32); neither operand changes.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::operator+(const vec512_storage &rhs) const {
+  const vec512_storage &lhs = *this;
+  return immintrin::mm256_add_epi32(lhs, rhs);
+}
+
+// Bitwise XOR of the two vectors; neither operand changes. The word arrays are
+// passed directly, which selects the sixteen-word overload.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::operator^(vec512_storage &rhs) const {
+  return immintrin::mm256_xor_si256(u32x16, rhs.u32x16);
+}
+
+// Rotates every 32-bit word right by 16 bits, done as a byte shuffle inside
+// each 128-bit lane.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::rotate_each_word_right16() const {
+  // Source-byte indices for one 128-bit lane: K0 covers the upper 8 bytes and
+  // K1 the lower 8 bytes.
+  constexpr long long K0 = 0x0d0c'0f0e'0908'0b0a;
+  constexpr long long K1 = 0x0504'0706'0100'0302;
+  const vec256_storage ctrl = immintrin::mm256_set_epi64x(K0, K1, K0, K1);
+
+  // Split into 256-bit halves so the 256-bit helper can be applied.
+  vec256_storage lo{{}};
+  vec256_storage hi{{}};
+  for (size_t i = 0; i < 8; ++i) {
+    lo.u32x8[i] = u32x16[i];
+    hi.u32x8[i] = u32x16[8 + i];
+  }
+
+  return construct_from_vec256(immintrin::mm256_shuffle_epi8(lo, ctrl),
+                               immintrin::mm256_shuffle_epi8(hi, ctrl));
+}
+
+// Rotates every 32-bit word right by 20 bits: (x >> 20) | (x << 12) per word.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::rotate_each_word_right20() const {
+  constexpr int32_t I = 20;
+
+  // Split into 256-bit halves so the 256-bit helpers can be applied.
+  vec256_storage lo{{}};
+  vec256_storage hi{{}};
+  for (size_t i = 0; i < 8; ++i) {
+    lo.u32x8[i] = u32x16[i];
+    hi.u32x8[i] = u32x16[8 + i];
+  }
+
+  lo = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(lo, I),
+                                 immintrin::mm256_slli_epi32(lo, 32 - I));
+  hi = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(hi, I),
+                                 immintrin::mm256_slli_epi32(hi, 32 - I));
+
+  return construct_from_vec256(lo, hi);
+}
+
+// Rotates every 32-bit word right by 24 bits, done as a byte shuffle inside
+// each 128-bit lane.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::rotate_each_word_right24() const {
+  // Source-byte indices for one 128-bit lane: K0 covers the upper 8 bytes and
+  // K1 the lower 8 bytes.
+  constexpr long long K0 = 0x0e0d'0c0f'0a09'080b;
+  constexpr long long K1 = 0x0605'0407'0201'0003;
+  const vec256_storage ctrl = immintrin::mm256_set_epi64x(K0, K1, K0, K1);
+
+  // Split into 256-bit halves so the 256-bit helper can be applied.
+  vec256_storage lo{{}};
+  vec256_storage hi{{}};
+  for (size_t i = 0; i < 8; ++i) {
+    lo.u32x8[i] = u32x16[i];
+    hi.u32x8[i] = u32x16[8 + i];
+  }
+
+  return construct_from_vec256(immintrin::mm256_shuffle_epi8(lo, ctrl),
+                               immintrin::mm256_shuffle_epi8(hi, ctrl));
+}
+
+// Rotates every 32-bit word right by 25 bits: (x >> 25) | (x << 7) per word.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::rotate_each_word_right25() const {
+  constexpr int32_t I = 25;
+
+  // Split into 256-bit halves so the 256-bit helpers can be applied.
+  vec256_storage lo{{}};
+  vec256_storage hi{{}};
+  for (size_t i = 0; i < 8; ++i) {
+    lo.u32x8[i] = u32x16[i];
+    hi.u32x8[i] = u32x16[8 + i];
+  }
+
+  lo = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(lo, I),
+                                 immintrin::mm256_slli_epi32(lo, 32 - I));
+  hi = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(hi, I),
+                                 immintrin::mm256_slli_epi32(hi, 32 - I));
+
+  return construct_from_vec256(lo, hi);
+}
+
+// Applies vec256_storage::shuffle_lane_words3012 to both 256-bit halves and
+// joins the results.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::shuffle_lane_words3012() const {
+  vec256_storage lo{{}};
+  vec256_storage hi{{}};
+  for (size_t i = 0; i < 8; ++i) {
+    lo.u32x8[i] = u32x16[i];
+    hi.u32x8[i] = u32x16[8 + i];
+  }
+
+  return construct_from_vec256(lo.shuffle_lane_words3012(),
+                               hi.shuffle_lane_words3012());
+}
+
+// Applies vec256_storage::shuffle_lane_words2301 to both 256-bit halves and
+// joins the results.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::shuffle_lane_words2301() const {
+  vec256_storage lo{{}};
+  vec256_storage hi{{}};
+  for (size_t i = 0; i < 8; ++i) {
+    lo.u32x8[i] = u32x16[i];
+    hi.u32x8[i] = u32x16[8 + i];
+  }
+
+  return construct_from_vec256(lo.shuffle_lane_words2301(),
+                               hi.shuffle_lane_words2301());
+}
+
+// Applies vec256_storage::shuffle_lane_words1230 to both 256-bit halves and
+// joins the results.
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::shuffle_lane_words1230() const {
+  vec256_storage lo{{}};
+  vec256_storage hi{{}};
+  for (size_t i = 0; i < 8; ++i) {
+    lo.u32x8[i] = u32x16[i];
+    hi.u32x8[i] = u32x16[8 + i];
+  }
+
+  return construct_from_vec256(lo.shuffle_lane_words1230(),
+                               hi.shuffle_lane_words1230());
+}
+
+// Transposes a 4x4 matrix of 128-bit blocks whose rows are a, b, c and d.
+// Writing xNM for the 128-bit blocks of row x:
+//
+//   a00:a01 a10:a11        a00:b00 c00:d00
+//   b00:b01 b10:b11   =>   a01:b01 c01:d01
+//   c00:c01 c10:c11        a10:b10 c10:d10
+//   d00:d01 d10:d11        a11:b11 c11:d11
+LIBC_INLINE constexpr cpp::array<vec512_storage, 4>
+vec512_storage::transpose4(const vec512_storage &a, const vec512_storage &b,
+                           const vec512_storage &c, const vec512_storage &d) {
+  // Copies words [first, first + 8) of a 512-bit vector into a 256-bit one.
+  auto half_of = [](const vec512_storage &v, size_t first) {
+    vec256_storage h{{}};
+    for (size_t w = 0; w < 8; ++w)
+      h.u32x8[w] = v.u32x16[first + w];
+    return h;
+  };
+
+  const vec256_storage a_lo = half_of(a, 0), a_hi = half_of(a, 8);
+  const vec256_storage b_lo = half_of(b, 0), b_hi = half_of(b, 8);
+  const vec256_storage c_lo = half_of(c, 0), c_hi = half_of(c, 8);
+  const vec256_storage d_lo = half_of(d, 0), d_hi = half_of(d, 8);
+
+  // 0x20 pairs the lower 128-bit halves of both inputs, 0x31 the upper ones.
+  const auto ab00 = immintrin::mm256_permute2x128_si256(a_lo, b_lo, 0x20);
+  const auto ab01 = immintrin::mm256_permute2x128_si256(a_lo, b_lo, 0x31);
+  const auto ab10 = immintrin::mm256_permute2x128_si256(a_hi, b_hi, 0x20);
+  const auto ab11 = immintrin::mm256_permute2x128_si256(a_hi, b_hi, 0x31);
+  const auto cd00 = immintrin::mm256_permute2x128_si256(c_lo, d_lo, 0x20);
+  const auto cd01 = immintrin::mm256_permute2x128_si256(c_lo, d_lo, 0x31);
+  const auto cd10 = immintrin::mm256_permute2x128_si256(c_hi, d_hi, 0x20);
+  const auto cd11 = immintrin::mm256_permute2x128_si256(c_hi, d_hi, 0x31);
+
+  auto row0 = vec512_storage::construct_from_vec256(ab00, cd00);
+  auto row1 = vec512_storage::construct_from_vec256(ab01, cd01);
+  auto row2 = vec512_storage::construct_from_vec256(ab10, cd10);
+  auto row3 = vec512_storage::construct_from_vec256(ab11, cd11);
+
+  return cpp::array<vec512_storage, 4>{row0, row1, row2, row3};
+}
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wctype/conversion/random/vec512_storage.h b/libc/src/__support/wctype/conversion/random/vec512_storage.h
new file mode 100644
index 0000000000000..de80d4d0442d2
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec512_storage.h
@@ -0,0 +1,82 @@
+//===-- 512-bit storage for StdRng - wctype conversion ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This is a portable implementation of a 512-bit vector storage implemented
+// with static arrays, works with `constexpr` code.
+// Only little-endian is supported (runtime code is not affected by this).
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H
+
+#include "vec256_storage.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+// 512-bit value held as sixteen 32-bit words, least-significant word first.
+// NOTE(review): most members are declared constexpr (hence implicitly inline)
+// but are defined in vec512_storage.cpp. A translation unit that only includes
+// this header has no definition to call - confirm this is intended.
+union vec512_storage {
+  // `mutable` so the words can be written even through a const object.
+  mutable cpp::array<uint32_t, 16> u32x16;
+
+  // Joins two 256-bit vectors: `lo` becomes words 0..7, `hi` words 8..15.
+  LIBC_INLINE static constexpr vec512_storage
+  construct_from_vec256(const vec256_storage &lo, const vec256_storage &hi);
+
+  // Implicit conversion to the underlying word array (by copy).
+  LIBC_INLINE constexpr operator cpp::array<uint32_t, 16>() const {
+    return u32x16;
+  }
+
+  // Wraps sixteen words without reordering them.
+  LIBC_INLINE static constexpr vec512_storage
+  new128(cpp::array<uint32_t, 16> &&xs) {
+    return vec512_storage{xs};
+  }
+
+  // Packs four 128-bit vectors in argument order, `i` in the lowest words.
+  LIBC_INLINE static constexpr vec512_storage new128(vec128_storage i,
+                                                     vec128_storage j,
+                                                     vec128_storage k,
+                                                     vec128_storage l);
+
+  // Returns a copy of this vector.
+  LIBC_INLINE constexpr vec512_storage unpack() const { return *this; }
+
+  // Lane-wise 32-bit addition into this vector; writes via the mutable member.
+  LIBC_INLINE constexpr const vec512_storage &
+  operator+=(vec512_storage &rhs) const;
+
+  // Lane-wise 32-bit addition and bitwise XOR; both return a new vector.
+  LIBC_INLINE constexpr vec512_storage operator+(const vec512_storage &) const;
+  LIBC_INLINE constexpr vec512_storage operator^(vec512_storage &rhs) const;
+
+  // Rotate every 32-bit word right by the number of bits in the name.
+  LIBC_INLINE constexpr vec512_storage rotate_each_word_right16() const;
+  LIBC_INLINE constexpr vec512_storage rotate_each_word_right20() const;
+  LIBC_INLINE constexpr vec512_storage rotate_each_word_right24() const;
+  LIBC_INLINE constexpr vec512_storage rotate_each_word_right25() const;
+
+  // Permute the four words inside every 128-bit lane.
+  LIBC_INLINE constexpr vec512_storage shuffle_lane_words3012() const;
+  LIBC_INLINE constexpr vec512_storage shuffle_lane_words2301() const;
+  LIBC_INLINE constexpr vec512_storage shuffle_lane_words1230() const;
+
+  // Transposes a 4x4 matrix of 128-bit blocks whose rows are a, b, c and d.
+  LIBC_INLINE static constexpr cpp::array<vec512_storage, 4>
+  transpose4(const vec512_storage &a, const vec512_storage &b,
+             const vec512_storage &c, const vec512_storage &d);
+
+  // Non-owning view over the sixteen words stored in this object.
+  LIBC_INLINE constexpr conversion_utils::Slice<uint32_t> to_scalars() const {
+    return conversion_utils::Slice<uint32_t>(u32x16.data(), u32x16.size());
+  }
+
+  // Returns a copy of this vector.
+  LIBC_INLINE constexpr vec512_storage to_lanes() const { return *this; }
+};
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H
More information about the llvm-branch-commits
mailing list