[clang] [llvm] [Support] Add SipHash-based 16/64-bit ptrauth stable hash. (PR #93902)

Mon Jun 3 13:27:00 PDT 2024

================
@@ -0,0 +1,174 @@
+//===--- StableHash.cpp - An ABI-stable string hash -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements an ABI-stable string hash based on SipHash, used to
+//  compute ptrauth discriminators.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SipHash.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include <cstdint>
+#include <cstring>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-siphash"
+
+//  Lightly adapted from the SipHash reference C implementation by
+//  Jean-Philippe Aumasson and Daniel J. Bernstein.
+
+#define SIPHASH_ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
+
+#define SIPHASH_U8TO64_LE(p)                                                   \
+  (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) |                          \
+   ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) |                   \
+   ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) |                   \
+   ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))
+
+#define SIPHASH_SIPROUND                                                       \
+  do {                                                                         \
+    v0 += v1;                                                                  \
+    v1 = SIPHASH_ROTL(v1, 13);                                                 \
+    v1 ^= v0;                                                                  \
+    v0 = SIPHASH_ROTL(v0, 32);                                                 \
+    v2 += v3;                                                                  \
+    v3 = SIPHASH_ROTL(v3, 16);                                                 \
+    v3 ^= v2;                                                                  \
+    v0 += v3;                                                                  \
+    v3 = SIPHASH_ROTL(v3, 21);                                                 \
+    v3 ^= v0;                                                                  \
+    v2 += v1;                                                                  \
+    v1 = SIPHASH_ROTL(v1, 17);                                                 \
+    v1 ^= v2;                                                                  \
+    v2 = SIPHASH_ROTL(v2, 32);                                                 \
+  } while (0)
+
+template <int cROUNDS, int dROUNDS, class ResultTy>
+static inline ResultTy siphash(const uint8_t *in, uint64_t inlen,
+                               const uint8_t (&k)[16]) {
+  static_assert(sizeof(ResultTy) == 8 || sizeof(ResultTy) == 16,
+                "result type should be uint64_t or uint128_t");
+  uint64_t v0 = 0x736f6d6570736575ULL;
+  uint64_t v1 = 0x646f72616e646f6dULL;
+  uint64_t v2 = 0x6c7967656e657261ULL;
+  uint64_t v3 = 0x7465646279746573ULL;
+  uint64_t b;
+  uint64_t k0 = SIPHASH_U8TO64_LE(k);
+  uint64_t k1 = SIPHASH_U8TO64_LE(k + 8);
+  uint64_t m;
+  int i;
+  const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
+  const int left = inlen & 7;
+  b = ((uint64_t)inlen) << 56;
+  v3 ^= k1;
+  v2 ^= k0;
+  v1 ^= k1;
+  v0 ^= k0;
+
+  if (sizeof(ResultTy) == 16) {
+    v1 ^= 0xee;
+  }
+
+  for (; in != end; in += 8) {
+    m = SIPHASH_U8TO64_LE(in);
+    v3 ^= m;
+
+    for (i = 0; i < cROUNDS; ++i)
+      SIPHASH_SIPROUND;
+
+    v0 ^= m;
+  }
+
+  switch (left) {
+  case 7:
+    b |= ((uint64_t)in[6]) << 48;
+    LLVM_FALLTHROUGH;
+  case 6:
+    b |= ((uint64_t)in[5]) << 40;
+    LLVM_FALLTHROUGH;
+  case 5:
+    b |= ((uint64_t)in[4]) << 32;
+    LLVM_FALLTHROUGH;
+  case 4:
+    b |= ((uint64_t)in[3]) << 24;
+    LLVM_FALLTHROUGH;
+  case 3:
+    b |= ((uint64_t)in[2]) << 16;
+    LLVM_FALLTHROUGH;
+  case 2:
+    b |= ((uint64_t)in[1]) << 8;
+    LLVM_FALLTHROUGH;
+  case 1:
+    b |= ((uint64_t)in[0]);
+    break;
+  case 0:
+    break;
+  }
+
+  v3 ^= b;
+
+  for (i = 0; i < cROUNDS; ++i)
+    SIPHASH_SIPROUND;
+
+  v0 ^= b;
+
+  if (sizeof(ResultTy) == 8) {
+    v2 ^= 0xff;
+  } else {
+    v2 ^= 0xee;
+  }
+
+  for (i = 0; i < dROUNDS; ++i)
+    SIPHASH_SIPROUND;
+
+  b = v0 ^ v1 ^ v2 ^ v3;
+
+  // This mess with the result type would be easier with 'if constexpr'.
+
+  uint64_t firstHalf = b;
+  if (sizeof(ResultTy) == 8)
+    return firstHalf;
+
+  v1 ^= 0xdd;
+
+  for (i = 0; i < dROUNDS; ++i)
+    SIPHASH_SIPROUND;
+
+  b = v0 ^ v1 ^ v2 ^ v3;
+  uint64_t secondHalf = b;
+
+  return firstHalf | (ResultTy(secondHalf) << (sizeof(ResultTy) == 8 ? 0 : 64));
+}
+
+//===--- LLVM-specific wrappers around siphash.
+
+/// Compute an ABI-stable 64-bit hash of the given string.
+uint64_t llvm::getPointerAuthStableSipHash64(StringRef Str) {
+  static const uint8_t K[16] = {0xb5, 0xd4, 0xc9, 0xeb, 0x79, 0x10, 0x4a, 0x79,
+                                0x6f, 0xec, 0x8b, 0x1b, 0x42, 0x87, 0x81, 0xd4};
+
+  // The aliasing is fine here because of omnipotent char.
+  auto *Data = reinterpret_cast<const uint8_t *>(Str.data());
+  return siphash<2, 4, uint64_t>(Data, Str.size(), K);
+}
+
+/// Compute an ABI-stable 16-bit hash of the given string.
+uint64_t llvm::getPointerAuthStableSipHash16(StringRef Str) {
+  uint64_t RawHash = getPointerAuthStableSipHash64(Str);
+
+  // Produce a non-zero 16-bit discriminator.
+  uint64_t Discriminator = (RawHash % 0xFFFF) + 1;
----------------
kovdan01 wrote:

I'm sure that such scheme is already used in downstream for a long time and there is a strong point in having non-zero discriminator always when we compute that from a string, but let me mention a potential downside of such approach instead of just doing `uint64_t Discriminator = RawHash % 0x10000;`.

If we assume that 64-bit hash values are distributed uniformly when applying the hash function to an infinite set of all possible strings (this should probably be true for a cryptographically secure hash), non-zero 16-bit values computed as here become non-uniformly distributed:
- 16-bit value 0: 0 64-bit values corresponding
- 16-bit value 1: 281479271743490 64-bit values corresponding
- 16-bit values 2..65535: 281479271743489 64-bit values corresponding

I suppose that it might be OK, it's just not very consistent with 64-bit hash computation since we do not try to avoid zero value there. I get the point that the chance of having zero 64-bit hash value is very low compared to 16-bit though.

The final point: if that was discussed with security researchers, I have no issues with such an implementation ignoring 16-bit zeros. If not - IMHO it's better to talk to security specialists and ask them for a piece of advice.


https://github.com/llvm/llvm-project/pull/93902