[llvm-branch-commits] [libc] [libc][wctype] Upstream immintrin storage from PtrHash-cc prototype to LLVM libc (PR #175038)

Muhammad Bassiouni via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Jan 9 14:00:46 PST 2026


https://github.com/bassiounix updated https://github.com/llvm/llvm-project/pull/175038

>From 0400f21353ef23e82e6515ead54f7e8ff2088a7c Mon Sep 17 00:00:00 2001
From: bassiounix <muhammad.m.bassiouni at gmail.com>
Date: Thu, 8 Jan 2026 19:44:47 +0200
Subject: [PATCH] [libc][wctype] Upstream immintrin storage from PtrHash-cc
 prototype to LLVM libc

---
 .../wctype/conversion/random/CMakeLists.txt   |  22 ++
 .../__support/wctype/conversion/random/imm.h  | 268 ++++++++++++++
 .../wctype/conversion/random/vec128_storage.h |  76 ++++
 .../conversion/random/vec256_storage.cpp      |  60 +++
 .../wctype/conversion/random/vec256_storage.h |  63 ++++
 .../conversion/random/vec512_storage.cpp      | 342 ++++++++++++++++++
 .../wctype/conversion/random/vec512_storage.h |  82 +++++
 7 files changed, 913 insertions(+)
 create mode 100644 libc/src/__support/wctype/conversion/random/imm.h
 create mode 100644 libc/src/__support/wctype/conversion/random/vec128_storage.h
 create mode 100644 libc/src/__support/wctype/conversion/random/vec256_storage.cpp
 create mode 100644 libc/src/__support/wctype/conversion/random/vec256_storage.h
 create mode 100644 libc/src/__support/wctype/conversion/random/vec512_storage.cpp
 create mode 100644 libc/src/__support/wctype/conversion/random/vec512_storage.h

diff --git a/libc/src/__support/wctype/conversion/random/CMakeLists.txt b/libc/src/__support/wctype/conversion/random/CMakeLists.txt
index dd9d577e4cd8b..a7cab77016033 100644
--- a/libc/src/__support/wctype/conversion/random/CMakeLists.txt
+++ b/libc/src/__support/wctype/conversion/random/CMakeLists.txt
@@ -5,3 +5,25 @@ add_header_library(
   DEPENDS
     libc.src.__support.wctype.conversion.utils.utils
 )
+
+add_header_library(
+  vec128_storage
+  HDRS
+    vec128_storage.h
+  DEPENDS
+    libc.src.__support.CPP.array
+    libc.src.__support.wctype.conversion.utils.slice
+)
+
+add_object_library(
+  vector_storage
+  HDRS
+    imm.h
+    vec256_storage.h
+    vec512_storage.h
+  SRCS
+    vec512_storage.cpp
+    vec256_storage.cpp
+  DEPENDS
+    .vec128_storage
+)
diff --git a/libc/src/__support/wctype/conversion/random/imm.h b/libc/src/__support/wctype/conversion/random/imm.h
new file mode 100644
index 0000000000000..e67cb1b5774b9
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/imm.h
@@ -0,0 +1,268 @@
+//===-- Portable subset of <immintrin.h> ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Only little-endian is supported (runtime code is not affected by this).
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H
+
+#include "vec512_storage.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace immintrin {
+
+using random::vector_storage::vec128_storage;
+using random::vector_storage::vec256_storage;
+using random::vector_storage::vec512_storage;
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_add_epi32(const vec256_storage &a, const vec256_storage &b) {
+  // Word-wise 32-bit addition; unsigned arithmetic wraps modulo 2^32.
+  vec256_storage sum{{}};
+  for (size_t lane = 0; lane < 8; ++lane)
+    sum.u32x8[lane] = a.u32x8[lane] + b.u32x8[lane];
+  return sum;
+}
+
+LIBC_INLINE static constexpr vec512_storage
+mm256_add_epi32(const vec512_storage &a, const vec512_storage &b) {
+  // 512-bit overload: sixteen independent sums, each wrapping modulo 2^32.
+  vec512_storage sum{{}};
+  for (size_t lane = 0; lane < 16; ++lane)
+    sum.u32x16[lane] = a.u32x16[lane] + b.u32x16[lane];
+  return sum;
+}
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_xor_si256(const cpp::array<uint32_t, 8> &a,
+                const cpp::array<uint32_t, 8> &b) {
+  // Word-wise XOR; the array converts to vec256_storage on return.
+  cpp::array<uint32_t, 8> words{};
+  for (size_t w = 0; w < 8; ++w)
+    words[w] = a[w] ^ b[w];
+  return words;
+}
+
+LIBC_INLINE static constexpr vec512_storage
+mm256_xor_si256(const cpp::array<uint32_t, 16> &a,
+                const cpp::array<uint32_t, 16> &b) {
+  // 512-bit overload: word-wise XOR across all sixteen 32-bit words.
+  vec512_storage out{.u32x16 = {}};
+  for (size_t w = 0; w < 16; ++w)
+    out.u32x16[w] = a[w] ^ b[w];
+  return out;
+}
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_shuffle_epi8(const vec256_storage &a, const vec256_storage &b) {
+  vec256_storage r{{}};
+  for (size_t k = 0; k < 8; k++) {
+    r.u32x8[k] = 0;
+  }
+  // Byte shuffle: `b` supplies one control byte per destination byte of `a`.
+  // Helper for 128-bit lane (16 bytes)
+  auto shuffle_128 = [](const uint32_t *src, const uint32_t *ctrl,
+                        uint32_t *dst) {
+    // dst must be zero-initialized by caller
+    for (int i = 0; i < 16; ++i) {
+      uint8_t c = (ctrl[i / 4] >> ((i % 4) * 8)) & 0xFF;
+      // Control bit 7 set means the destination byte stays zero.
+      if (c & 0x80) {
+        // zero byte → already zero
+        continue;
+      }
+      // The low four control bits pick the source byte inside this lane.
+      int k = c & 0x0F;
+      uint8_t byte = (src[k / 4] >> ((k % 4) * 8)) & 0xFF;
+      // OR the selected byte into its slot of the little-endian word.
+      dst[i / 4] |= static_cast<uint32_t>(byte) << ((i % 4) * 8);
+    }
+  };
+
+  // Shuffle lower 128-bit lane
+  shuffle_128(&a.u32x8[0], &b.u32x8[0], &r.u32x8[0]);
+  // Shuffle upper 128-bit lane
+  shuffle_128(&a.u32x8[4], &b.u32x8[4], &r.u32x8[4]);
+
+  return r;
+}
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_set_epi64x(long long a, long long b, long long c, long long d) {
+  // Arguments are given most-significant first: `d` fills the lowest 64 bits
+  // and `a` the highest. Listing them in storage order lets one loop emit
+  // every value as two little-endian words.
+  const long long in_storage_order[4] = {d, c, b, a};
+
+  vec256_storage packed{{}};
+  for (size_t q = 0; q < 4; ++q) {
+    const long long value = in_storage_order[q];
+    // Bits [31:0] go to the even-indexed word and bits [63:32] to the
+    // odd-indexed word that follows it.
+    packed.u32x8[2 * q] = static_cast<uint32_t>(value);
+    packed.u32x8[2 * q + 1] = static_cast<uint32_t>(value >> 32);
+  }
+
+  return packed;
+}
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_or_si256(const vec256_storage &a, const vec256_storage &b) {
+  // Bitwise OR of the corresponding 32-bit words of both operands.
+  vec256_storage merged{{}};
+  for (size_t w = 0; w < 8; ++w)
+    merged.u32x8[w] = a.u32x8[w] | b.u32x8[w];
+  return merged;
+}
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_srli_epi32(const vec256_storage &a, int count) {
+  // Logical right shift of each 32-bit word. As with the hardware intrinsic,
+  // a count above 31 clears every word; masking with 0x1F would instead wrap
+  // (e.g. 32 -> 0) and hand back `a` unchanged. A negative count is treated
+  // as out of range too, which also avoids an undefined shift.
+  vec256_storage r{{}};
+  if (count < 0 || count > 31)
+    return r;
+  for (int i = 0; i < 8; ++i)
+    r.u32x8[i] = a.u32x8[i] >> count;
+  return r;
+}
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_slli_epi32(const vec256_storage &a, int count) {
+  // Logical left shift of each 32-bit word. As with the hardware intrinsic,
+  // a count above 31 clears every word; masking with 0x1F would instead wrap
+  // (e.g. 32 -> 0) and hand back `a` unchanged. A negative count is treated
+  // as out of range too, which also avoids an undefined shift.
+  vec256_storage r{{}};
+  if (count < 0 || count > 31)
+    return r;
+  for (int i = 0; i < 8; ++i)
+    r.u32x8[i] = a.u32x8[i] << count;
+  return r;
+}
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_permute2x128_si256(const vec256_storage &V1, const vec256_storage &V2,
+                         int M) {
+  vec256_storage r{{}};
+  // M's low nibble controls result words 0..3, its high nibble words 4..7.
+  // For each 128-bit destination half
+  for (int half = 0; half < 2; ++half) {
+    int control = (M >> (half * 4)) & 0xF;
+    int dst_base = half * 4;
+
+    if (control & 0x8) {
+      // bit 3 set → zero this 128-bit half
+      for (int i = 0; i < 4; ++i) {
+        r.u32x8[dst_base + i] = 0;
+      }
+    } else {
+      // bits [1:0] select source half
+      const vec256_storage *src{};
+      int src_base{};
+      // Bit 1 picks V1 (0) or V2 (1); bit 0 picks its lower or upper half.
+      switch (control & 0x3) {
+      case 0: // V1 lower
+        src = &V1;
+        src_base = 0;
+        break;
+      case 1: // V1 upper
+        src = &V1;
+        src_base = 4;
+        break;
+      case 2: // V2 lower
+        src = &V2;
+        src_base = 0;
+        break;
+      case 3: // V2 upper
+        src = &V2;
+        src_base = 4;
+        break;
+      }
+      // Copy the four selected words into this half of the result.
+      for (int i = 0; i < 4; ++i) {
+        r.u32x8[dst_base + i] = src->u32x8[src_base + i];
+      }
+    }
+  }
+
+  return r;
+}
+
+// lo fills words 0..3 and hi words 4..7; each is 128 bits as 4 x 32-bit words.
+LIBC_INLINE static constexpr vec256_storage
+mm256_setr_m128i(const vec128_storage &lo, const vec128_storage &hi) {
+  return vec256_storage{{
+      lo.u32x4[0],
+      lo.u32x4[1],
+      lo.u32x4[2],
+      lo.u32x4[3],
+      hi.u32x4[0],
+      hi.u32x4[1],
+      hi.u32x4[2],
+      hi.u32x4[3],
+  }};
+}
+
+LIBC_INLINE static constexpr vec256_storage
+mm256_shuffle_epi32(vec256_storage a, int imm) {
+  // `imm` holds four 2-bit selectors; selector i names the source word for
+  // destination word i. The same selectors are applied independently inside
+  // each 128-bit lane, so words never move between the two lanes. `a` is a
+  // by-value copy, so both lanes can be filled in a single pass.
+  vec256_storage shuffled{{}};
+
+  for (size_t dst = 0; dst < 4; ++dst) {
+    const size_t pick = static_cast<size_t>((imm >> (2 * dst)) & 0x3);
+    // Lower lane: words 0..3.
+    shuffled.u32x8[dst] = a.u32x8[pick];
+    // Upper lane: words 4..7.
+    shuffled.u32x8[4 + dst] = a.u32x8[4 + pick];
+  }
+
+  return shuffled;
+}
+
+LIBC_INLINE static constexpr vec128_storage
+mm256_extracti128_si256(const vec256_storage &V, int M) {
+  // Bit 0 of M selects the half: 0 -> words 0..3, 1 -> words 4..7.
+  const int at = (M & 1) ? 4 : 0;
+  return {{V.u32x8[at], V.u32x8[at + 1], V.u32x8[at + 2], V.u32x8[at + 3]}};
+}
+
+LIBC_INLINE static constexpr cpp::array<uint32_t, 4>
+add_epi64(const cpp::array<uint32_t, 4> &a, const cpp::array<uint32_t, 4> &b) {
+  // Each 64-bit lane is two little-endian 32-bit words, so the carry out of
+  // the low word must be added into the high word (wrapping modulo 2^64).
+  const uint32_t lo0 = a[0] + b[0];
+  const uint32_t lo1 = a[2] + b[2];
+  return {lo0, a[1] + b[1] + (lo0 < a[0] ? 1u : 0u),
+          lo1, a[3] + b[3] + (lo1 < a[2] ? 1u : 0u)};
+}
+
+LIBC_INLINE static constexpr vec128_storage
+mm_add_epi64(const cpp::array<uint32_t, 4> &a,
+             const cpp::array<uint32_t, 4> &b) {
+  return {add_epi64(a, b)};
+}
+
+} // namespace immintrin
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H
diff --git a/libc/src/__support/wctype/conversion/random/vec128_storage.h b/libc/src/__support/wctype/conversion/random/vec128_storage.h
new file mode 100644
index 0000000000000..1416fe5b9a60a
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec128_storage.h
@@ -0,0 +1,76 @@
+//===-- 128-bit storage for StdRng - wctype conversion ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This is a portable implementation of a 128-bit vector storage implemented
+// with static arrays, parallel of <immintrin.h>'s `__m128i` which works with
+// `constexpr` code.
+// Only little-endian is supported (runtime code is not affected by this).
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/wctype/conversion/utils/slice.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+union vec128_storage {
+  mutable cpp::array<uint32_t, 4> u32x4;
+  // 128 bits held as four 32-bit words; `mutable` permits writes via const.
+  LIBC_INLINE constexpr vec128_storage(cpp::array<uint32_t, 4> &&x)
+      : u32x4(x) {}
+  LIBC_INLINE constexpr vec128_storage(cpp::array<uint32_t, 4> &x) : u32x4(x) {}
+  LIBC_INLINE constexpr vec128_storage() : u32x4() {}
+
+  LIBC_INLINE constexpr operator cpp::array<uint32_t, 4>() const {
+    return this->u32x4;
+  }
+
+  LIBC_INLINE constexpr cpp::array<uint32_t, 4> to_lanes() const {
+    return this->u32x4;
+  }
+
+  LIBC_INLINE static constexpr vec128_storage
+  from_lanes(cpp::array<uint32_t, 4> &&xs) {
+    return vec128_storage(xs);
+  }
+  // Each 64-bit lane is stored as its low 32 bits followed by its high 32.
+  LIBC_INLINE static constexpr auto from_lanes(cpp::array<uint64_t, 2> &&xs) {
+    cpp::array<uint32_t, 4> x = {
+        static_cast<uint32_t>(xs[0]), static_cast<uint32_t>(xs[0] >> 32),
+        static_cast<uint32_t>(xs[1]), static_cast<uint32_t>(xs[1] >> 32)};
+    return vec128_storage(x);
+  }
+  // Assembles four little-endian 32-bit words from a 16-byte slice.
+  LIBC_INLINE static constexpr auto
+  read_le(conversion_utils::Slice<uint8_t> x) {
+    LIBC_ASSERT(x.size() == 16);
+    vec128_storage v = cpp::array<uint32_t, 4>{0};
+    const uint8_t *src = x.data();
+    // Widen each byte before shifting: uint8_t promotes to int, so
+    // `byte << 24` overflows a signed int whenever the byte is >= 0x80.
+    for (size_t i = 0; i < 16; ++i)
+      v.u32x4[i / 4] |= static_cast<uint32_t>(src[i]) << ((i % 4) * 8);
+    return v;
+  }
+};
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H
diff --git a/libc/src/__support/wctype/conversion/random/vec256_storage.cpp b/libc/src/__support/wctype/conversion/random/vec256_storage.cpp
new file mode 100644
index 0000000000000..806a452b99db7
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec256_storage.cpp
@@ -0,0 +1,60 @@
+//===-- 256-bit storage implementation --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This is a portable implementation of a 256-bit vector storage implemented
+// with static arrays, parallel of <immintrin.h>'s AVX256 which works with
+// `constexpr` code.
+// Only little-endian is supported (runtime code is not affected by this).
+
+#include "vec256_storage.h"
+#include "imm.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+LIBC_INLINE constexpr vec256_storage
+vec256_storage::shuffle_lane_words3012() const {
+  return immintrin::mm256_shuffle_epi32(*this, 0b0011'1001); // {w1,w2,w3,w0}
+}
+
+LIBC_INLINE constexpr vec256_storage
+vec256_storage::shuffle_lane_words2301() const {
+  return immintrin::mm256_shuffle_epi32(*this, 0b0100'1110); // {w2,w3,w0,w1}
+}
+
+LIBC_INLINE constexpr vec256_storage
+vec256_storage::shuffle_lane_words1230() const {
+  return immintrin::mm256_shuffle_epi32(*this, 0b1001'0011); // {w3,w0,w1,w2}
+}
+
+LIBC_INLINE constexpr vec256_storage vec256_storage::to_lanes() const {
+  // Pull out each 128-bit half, then lay the words back down in the same
+  // order: lower half in words 0..3, upper half in words 4..7.
+  const auto lower = immintrin::mm256_extracti128_si256(*this, 0);
+  const auto upper = immintrin::mm256_extracti128_si256(*this, 1);
+
+  vec256_storage lanes{{}};
+  for (size_t w = 0; w < 4; ++w) {
+    lanes.u32x8[w] = lower.u32x4[w];
+    lanes.u32x8[4 + w] = upper.u32x4[w];
+  }
+
+  return lanes;
+}
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wctype/conversion/random/vec256_storage.h b/libc/src/__support/wctype/conversion/random/vec256_storage.h
new file mode 100644
index 0000000000000..f3bd16b8731c5
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec256_storage.h
@@ -0,0 +1,63 @@
+//===-- 256-bit storage for StdRng - wctype conversion ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This is a portable implementation of a 256-bit vector storage implemented
+// with static arrays, parallel of <immintrin.h>'s AVX256 which works with
+// `constexpr` code.
+// Only little-endian is supported (runtime code is not affected by this).
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H
+
+#include "vec128_storage.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+union vec256_storage {
+  mutable cpp::array<uint32_t, 8> u32x8;
+
+  LIBC_INLINE constexpr operator cpp::array<uint32_t, 8>() const {
+    return this->u32x8;
+  }
+
+  LIBC_INLINE constexpr vec256_storage() : u32x8() {}
+  // Joins two 128-bit values into one 256-bit value: `lo` fills words 0..3
+  // and `hi` fills words 4..7.
+  LIBC_INLINE static constexpr vec256_storage
+  construct_from_vec128(vec128_storage &&lo, vec128_storage &&hi) {
+    vec256_storage joined{{}};
+    for (size_t w = 0; w < 4; ++w) {
+      joined.u32x8[w] = lo.u32x4[w];
+      joined.u32x8[w + 4] = hi.u32x4[w];
+    }
+    return joined;
+  }
+
+  LIBC_INLINE constexpr vec256_storage(cpp::array<uint32_t, 8> &&x)
+      : u32x8(x) {}
+
+  LIBC_INLINE constexpr vec256_storage shuffle_lane_words3012() const;
+  LIBC_INLINE constexpr vec256_storage shuffle_lane_words2301() const;
+  LIBC_INLINE constexpr vec256_storage shuffle_lane_words1230() const;
+  LIBC_INLINE constexpr vec256_storage to_lanes() const;
+};
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H
diff --git a/libc/src/__support/wctype/conversion/random/vec512_storage.cpp b/libc/src/__support/wctype/conversion/random/vec512_storage.cpp
new file mode 100644
index 0000000000000..e239703d08443
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec512_storage.cpp
@@ -0,0 +1,342 @@
+//===-- 512-bit storage implementation --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "vec512_storage.h"
+#include "imm.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::construct_from_vec256(const vec256_storage &lo,
+                                      const vec256_storage &hi) {
+  // `lo` becomes words 0..7 and `hi` words 8..15 of the 512-bit result.
+  vec512_storage joined{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    joined.u32x16[w] = lo.u32x8[w];
+    joined.u32x16[8 + w] = hi.u32x8[w];
+  }
+
+  return joined;
+}
+
+LIBC_INLINE constexpr vec512_storage vec512_storage::new128(vec128_storage i,
+                                                            vec128_storage j,
+                                                            vec128_storage k,
+                                                            vec128_storage l) {
+  // Lay the inputs out back to back in argument order: i -> words 0..3,
+  // j -> words 4..7, k -> words 8..11, l -> words 12..15.
+  const vec128_storage *parts[4] = {&i, &j, &k, &l};
+  vec512_storage packed{{}};
+  for (size_t p = 0; p < 4; ++p)
+    for (size_t w = 0; w < 4; ++w)
+      packed.u32x16[4 * p + w] = parts[p]->u32x4[w];
+  return packed;
+}
+
+LIBC_INLINE constexpr const vec512_storage &
+vec512_storage::operator+=(vec512_storage &rhs) const {
+  this->u32x16 = immintrin::mm256_add_epi32(*this, rhs).u32x16;
+  return *this; // u32x16 is mutable, so this const method updated the words
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::operator+(const vec512_storage &rhs) const {
+  return immintrin::mm256_add_epi32(*this, rhs); // word-wise, mod 2^32
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::operator^(vec512_storage &rhs) const {
+  return immintrin::mm256_xor_si256(*this, rhs); // operands convert to arrays
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::rotate_each_word_right16() const {
+  // A 16-bit rotation only moves whole bytes, so it is done with a byte
+  // shuffle. Each control byte names the source byte (within a 16-byte lane)
+  // for one destination byte; for every word the byte order {b0,b1,b2,b3}
+  // becomes {b2,b3,b0,b1}.
+  auto constexpr CTRL_HI = 0x0d0c'0f0e'0908'0b0a; // bytes 8..15 of a lane
+  auto constexpr CTRL_LO = 0x0504'0706'0100'0302; // bytes 0..7 of a lane
+  vec256_storage control =
+      immintrin::mm256_set_epi64x(CTRL_HI, CTRL_LO, CTRL_HI, CTRL_LO);
+
+  // Split into 256-bit halves: words 0..7 and words 8..15.
+  vec256_storage lower{{}};
+  vec256_storage upper{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    lower.u32x8[w] = this->u32x16[w];
+    upper.u32x8[w] = this->u32x16[8 + w];
+  }
+
+  lower = immintrin::mm256_shuffle_epi8(lower, control);
+  upper = immintrin::mm256_shuffle_epi8(upper, control);
+
+  // Reassemble the halves in their original positions.
+  vec512_storage rotated{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    rotated.u32x16[w] = lower.u32x8[w];
+    rotated.u32x16[8 + w] = upper.u32x8[w];
+  }
+  return rotated;
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::rotate_each_word_right20() const {
+  // rotr(x, N) == (x >> N) | (x << (32 - N)), applied to each 32-bit word.
+  // Both shift counts stay inside 1..31.
+  constexpr int32_t RIGHT = 20;
+  constexpr int32_t LEFT = 32 - RIGHT;
+
+  // Split into 256-bit halves: words 0..7 and words 8..15.
+  vec256_storage lower{{}};
+  vec256_storage upper{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    lower.u32x8[w] = this->u32x16[w];
+    upper.u32x8[w] = this->u32x16[8 + w];
+  }
+
+  // OR together the two shifted copies of each half.
+  lower = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(lower, RIGHT),
+                                    immintrin::mm256_slli_epi32(lower, LEFT));
+  upper = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(upper, RIGHT),
+                                    immintrin::mm256_slli_epi32(upper, LEFT));
+
+  // Reassemble the halves in their original positions.
+  vec512_storage rotated{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    rotated.u32x16[w] = lower.u32x8[w];
+    rotated.u32x16[8 + w] = upper.u32x8[w];
+  }
+
+  return rotated;
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::rotate_each_word_right24() const {
+  // A 24-bit rotation only moves whole bytes, so it is done with a byte
+  // shuffle. Each control byte names the source byte (within a 16-byte lane)
+  // for one destination byte; for every word the byte order {b0,b1,b2,b3}
+  // becomes {b3,b0,b1,b2}.
+  auto constexpr CTRL_HI = 0x0e0d'0c0f'0a09'080b; // bytes 8..15 of a lane
+  auto constexpr CTRL_LO = 0x0605'0407'0201'0003; // bytes 0..7 of a lane
+  vec256_storage control =
+      immintrin::mm256_set_epi64x(CTRL_HI, CTRL_LO, CTRL_HI, CTRL_LO);
+
+  // Split into 256-bit halves: words 0..7 and words 8..15.
+  vec256_storage lower{{}};
+  vec256_storage upper{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    lower.u32x8[w] = this->u32x16[w];
+    upper.u32x8[w] = this->u32x16[8 + w];
+  }
+
+  lower = immintrin::mm256_shuffle_epi8(lower, control);
+  upper = immintrin::mm256_shuffle_epi8(upper, control);
+
+  // Reassemble the halves in their original positions.
+  vec512_storage rotated{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    rotated.u32x16[w] = lower.u32x8[w];
+    rotated.u32x16[8 + w] = upper.u32x8[w];
+  }
+
+  return rotated;
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::rotate_each_word_right25() const {
+  // rotr(x, N) == (x >> N) | (x << (32 - N)), applied to each 32-bit word.
+  constexpr int32_t RIGHT = 25;
+  constexpr int32_t LEFT = 32 - RIGHT;
+
+  // Split into 256-bit halves: words 0..7 and words 8..15.
+  vec256_storage lower{{}};
+  vec256_storage upper{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    lower.u32x8[w] = this->u32x16[w];
+    upper.u32x8[w] = this->u32x16[8 + w];
+  }
+
+  // OR together the two shifted copies of each half.
+  lower = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(lower, RIGHT),
+                                    immintrin::mm256_slli_epi32(lower, LEFT));
+  upper = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(upper, RIGHT),
+                                    immintrin::mm256_slli_epi32(upper, LEFT));
+
+  // Reassemble the halves in their original positions.
+  vec512_storage rotated{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    rotated.u32x16[w] = lower.u32x8[w];
+    rotated.u32x16[8 + w] = upper.u32x8[w];
+  }
+
+  return rotated;
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::shuffle_lane_words3012() const {
+  // The permutation is implemented on 256-bit values, so it is applied to
+  // each half of the 512-bit value separately.
+
+  // Lower half = words 0..7, upper half = words 8..15.
+  vec256_storage lower{{}};
+  vec256_storage upper{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    lower.u32x8[w] = this->u32x16[w];
+    upper.u32x8[w] = this->u32x16[8 + w];
+  }
+
+  lower = lower.shuffle_lane_words3012();
+  upper = upper.shuffle_lane_words3012();
+
+  // Reassemble the halves in their original positions.
+  vec512_storage shuffled{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    shuffled.u32x16[w] = lower.u32x8[w];
+    shuffled.u32x16[8 + w] = upper.u32x8[w];
+  }
+
+  return shuffled;
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::shuffle_lane_words2301() const {
+  // The permutation is implemented on 256-bit values, so it is applied to
+  // each half of the 512-bit value separately.
+
+  // Lower half = words 0..7, upper half = words 8..15.
+  vec256_storage lower{{}};
+  vec256_storage upper{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    lower.u32x8[w] = this->u32x16[w];
+    upper.u32x8[w] = this->u32x16[8 + w];
+  }
+
+  lower = lower.shuffle_lane_words2301();
+  upper = upper.shuffle_lane_words2301();
+
+  // Reassemble the halves in their original positions.
+  vec512_storage shuffled{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    shuffled.u32x16[w] = lower.u32x8[w];
+    shuffled.u32x16[8 + w] = upper.u32x8[w];
+  }
+
+  return shuffled;
+}
+
+LIBC_INLINE constexpr vec512_storage
+vec512_storage::shuffle_lane_words1230() const {
+  // The permutation is implemented on 256-bit values, so it is applied to
+  // each half of the 512-bit value separately.
+
+  // Lower half = words 0..7, upper half = words 8..15.
+  vec256_storage lower{{}};
+  vec256_storage upper{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    lower.u32x8[w] = this->u32x16[w];
+    upper.u32x8[w] = this->u32x16[8 + w];
+  }
+
+  // Permute each half on its own.
+  lower = lower.shuffle_lane_words1230();
+  upper = upper.shuffle_lane_words1230();
+
+  // Reassemble the halves in their original positions.
+  vec512_storage shuffled{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    shuffled.u32x16[w] = lower.u32x8[w];
+    shuffled.u32x16[8 + w] = upper.u32x8[w];
+  }
+
+  return shuffled;
+}
+
+LIBC_INLINE constexpr cpp::array<vec512_storage, 4>
+vec512_storage::transpose4(const vec512_storage &a, const vec512_storage &b,
+                           const vec512_storage &c, const vec512_storage &d) {
+  /*
+   * a00:a01 a10:a11
+   * b00:b01 b10:b11
+   * c00:c01 c10:c11
+   * d00:d01 d10:d11
+   *       =>
+   * a00:b00 c00:d00
+   * a01:b01 c01:d01
+   * a10:b10 c10:d10
+   * a11:b11 c11:d11
+   */
+  // Lower 256-bit halves (words 0..7) of a and b.
+  vec256_storage a_low{{}};
+  vec256_storage b_low{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    a_low.u32x8[w] = a.u32x16[w];
+    b_low.u32x8[w] = b.u32x16[w];
+  }
+  // Control 0x20 joins the lower 128 bits of both operands; control 0x31
+  // joins their upper 128 bits.
+  auto const ab00 = immintrin::mm256_permute2x128_si256(a_low, b_low, 0x20);
+  auto const ab01 = immintrin::mm256_permute2x128_si256(a_low, b_low, 0x31);
+
+  // Upper 256-bit halves (words 8..15) of a and b.
+  vec256_storage a_high{{}};
+  vec256_storage b_high{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    a_high.u32x8[w] = a.u32x16[8 + w];
+    b_high.u32x8[w] = b.u32x16[8 + w];
+  }
+
+  // Same two pairings, applied to the upper halves.
+  auto const ab10 = immintrin::mm256_permute2x128_si256(a_high, b_high, 0x20);
+  auto const ab11 = immintrin::mm256_permute2x128_si256(a_high, b_high, 0x31);
+
+  // Lower 256-bit halves (words 0..7) of c and d.
+  vec256_storage c_low{{}};
+  vec256_storage d_low{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    c_low.u32x8[w] = c.u32x16[w];
+    d_low.u32x8[w] = d.u32x16[w];
+  }
+
+  // Same two pairings, applied to c and d.
+  auto const cd00 = immintrin::mm256_permute2x128_si256(c_low, d_low, 0x20);
+  auto const cd01 = immintrin::mm256_permute2x128_si256(c_low, d_low, 0x31);
+
+  // Upper 256-bit halves (words 8..15) of c and d.
+  vec256_storage c_high{{}};
+  vec256_storage d_high{{}};
+  for (size_t w = 0; w < 8; ++w) {
+    c_high.u32x8[w] = c.u32x16[8 + w];
+    d_high.u32x8[w] = d.u32x16[8 + w];
+  }
+
+  // Same two pairings, applied to the upper halves of c and d.
+  auto const cd10 = immintrin::mm256_permute2x128_si256(c_high, d_high, 0x20);
+  auto const cd11 = immintrin::mm256_permute2x128_si256(c_high, d_high, 0x31);
+
+  return cpp::array<vec512_storage, 4>{
+      vec512_storage::construct_from_vec256(ab00, cd00),
+      vec512_storage::construct_from_vec256(ab01, cd01),
+      vec512_storage::construct_from_vec256(ab10, cd10),
+      vec512_storage::construct_from_vec256(ab11, cd11),
+  };
+}
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wctype/conversion/random/vec512_storage.h b/libc/src/__support/wctype/conversion/random/vec512_storage.h
new file mode 100644
index 0000000000000..de80d4d0442d2
--- /dev/null
+++ b/libc/src/__support/wctype/conversion/random/vec512_storage.h
@@ -0,0 +1,82 @@
+//===-- 512-bit storage for StdRng - wctype conversion ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This is a portable implementation of a 512-bit vector storage implemented
+// with static arrays, works with `constexpr` code.
+// Only little-endian is supported (runtime code is not affected by this).
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H
+
+#include "vec256_storage.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace wctype_internal {
+
+namespace random {
+
+namespace vector_storage {
+
+union vec512_storage {
+  mutable cpp::array<uint32_t, 16> u32x16;
+  // Joins two 256-bit halves: lo supplies words 0..7, hi words 8..15.
+  LIBC_INLINE static constexpr vec512_storage
+  construct_from_vec256(const vec256_storage &lo, const vec256_storage &hi);
+
+  LIBC_INLINE constexpr operator cpp::array<uint32_t, 16>() const {
+    return this->u32x16;
+  }
+
+  LIBC_INLINE static constexpr vec512_storage
+  new128(cpp::array<uint32_t, 16> &&xs) {
+    return vec512_storage{xs};
+  }
+  // Concatenates four 128-bit values in argument order (i lowest, l highest).
+  LIBC_INLINE static constexpr vec512_storage new128(vec128_storage i,
+                                                     vec128_storage j,
+                                                     vec128_storage k,
+                                                     vec128_storage l);
+
+  LIBC_INLINE constexpr vec512_storage unpack() const { return *this; }
+  // Const-qualified: the sum is written through the mutable u32x16 member.
+  LIBC_INLINE constexpr const vec512_storage &
+  operator+=(vec512_storage &rhs) const;
+
+  LIBC_INLINE constexpr vec512_storage operator+(const vec512_storage &) const;
+  LIBC_INLINE constexpr vec512_storage operator^(vec512_storage &rhs) const;
+  // Rotate every 32-bit word right by the number of bits in the name.
+  LIBC_INLINE constexpr vec512_storage rotate_each_word_right16() const;
+  LIBC_INLINE constexpr vec512_storage rotate_each_word_right20() const;
+  LIBC_INLINE constexpr vec512_storage rotate_each_word_right24() const;
+  LIBC_INLINE constexpr vec512_storage rotate_each_word_right25() const;
+  // Permute the four words inside each 128-bit lane.
+  LIBC_INLINE constexpr vec512_storage shuffle_lane_words3012() const;
+  LIBC_INLINE constexpr vec512_storage shuffle_lane_words2301() const;
+  LIBC_INLINE constexpr vec512_storage shuffle_lane_words1230() const;
+  // Regroups the 128-bit lanes of a, b, c and d across four results.
+  LIBC_INLINE static constexpr cpp::array<vec512_storage, 4>
+  transpose4(const vec512_storage &a, const vec512_storage &b,
+             const vec512_storage &c, const vec512_storage &d);
+  // View over the sixteen 32-bit words held in u32x16.
+  LIBC_INLINE constexpr conversion_utils::Slice<uint32_t> to_scalars() const {
+    return conversion_utils::Slice<uint32_t>(this->u32x16.data(),
+                                             this->u32x16.size());
+  }
+
+  LIBC_INLINE constexpr vec512_storage to_lanes() const { return *this; }
+};
+
+} // namespace vector_storage
+
+} // namespace random
+
+} // namespace wctype_internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H



More information about the llvm-branch-commits mailing list