[libc-commits] [libc] [libc] add checksum for jmpbuf (PR #101110)

Tue Jul 30 14:30:40 PDT 2024

SchrodingerZhu wrote:

```
~ via 🐍 v3.10.12 
❯ clang++ -O3 test5.cc -DNO_CHECK

~ via 🐍 v3.10.12 
❯ ./a.out 
2845100
5104636

~ via 🐍 v3.10.12 
❯ ./a.out
2723879
5370382

~ via 🐍 v3.10.12 
❯ clang++ -O3 test5.cc

~ via 🐍 v3.10.12 
❯ ./a.out
7721258
5307271

~ via 🐍 v3.10.12 
❯ ./a.out
7872398
5451505
```

program
```c++
namespace internal {

// Folded multiplication.
// This function multiplies two 64-bit integers and xor the high and
// low 64-bit parts of the result.
inline __UINT64_TYPE__ folded_multiply(__UINT64_TYPE__ x, __UINT64_TYPE__ y) {
  __uint128_t p = static_cast<__uint128_t>(x) * static_cast<__uint128_t>(y);
  __UINT64_TYPE__ low = static_cast<__UINT64_TYPE__>(p);
  __UINT64_TYPE__ high = static_cast<__UINT64_TYPE__>(p >> 64);
  return low ^ high;
}

// Read as little endian.
// Shift-and-or implementation does not give a satisfactory code on aarch64.
// Therefore, we use a union to read the value.
template <typename T> inline T read_little_endian(const void *ptr) {
  const __UINT8_TYPE__ *bytes = static_cast<const __UINT8_TYPE__ *>(ptr);
  union {
    T value;
    __UINT8_TYPE__ buffer[sizeof(T)];
  } data;
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
  // Compiler should able to optimize this as a load followed by a byte swap.
  // On aarch64 (-mbig-endian), this compiles to the following for int:
  //      ldr     w0, [x0]
  //      rev     w0, w0
  //      ret
  for (__SIZE_TYPE__ i = 0; i < sizeof(T); ++i) {
    data.buffer[i] = bytes[sizeof(T) - i - 1];
  }
#else
  for (__SIZE_TYPE__ i = 0; i < sizeof(T); ++i) {
    data.buffer[i] = bytes[i];
  }
#endif
  return data.value;
}

// Specialized read functions for small values. size must be <= 8.
inline void read_small_values(const void *ptr, __SIZE_TYPE__ size, __UINT64_TYPE__ &low,
                                   __UINT64_TYPE__ &high) {
  const __UINT8_TYPE__ *bytes = static_cast<const __UINT8_TYPE__ *>(ptr);
  if (size >= 2) {
    if (size >= 4) {
      low = static_cast<__UINT64_TYPE__>(read_little_endian<__UINT32_TYPE__>(&bytes[0]));
      high =
          static_cast<__UINT64_TYPE__>(read_little_endian<__UINT32_TYPE__>(&bytes[size - 4]));
    } else {
      low = static_cast<__UINT64_TYPE__>(read_little_endian<__UINT16_TYPE__>(&bytes[0]));
      high = static_cast<__UINT64_TYPE__>(bytes[size - 1]);
    }
  } else {
    if (size > 0) {
      low = static_cast<__UINT64_TYPE__>(bytes[0]);
      high = static_cast<__UINT64_TYPE__>(bytes[0]);
    } else {
      low = 0;
      high = 0;
    }
  }
}

// This constant comes from Kunth's prng (it empirically works well).
inline constexpr __UINT64_TYPE__ MULTIPLE = 6364136223846793005;
// Rotation amount for mixing.
inline constexpr __UINT64_TYPE__ ROTATE = 23;

// Randomly generated values. For now, we use the same values as in aHash as
// they are widely tested.
// https://github.com/tkaitchuck/aHash/blob/9f6a2ad8b721fd28da8dc1d0b7996677b374357c/src/random_state.rs#L38
inline constexpr __UINT64_TYPE__ RANDOMNESS[2][4] = {
    {0x243f6a8885a308d3, 0x13198a2e03707344, 0xa4093822299f31d0,
     0x082efa98ec4e6c89},
    {0x452821e638d01377, 0xbe5466cf34e90c6c, 0xc0ac29b7c97c50dd,
     0x3f84d5b5b5470917},
};

// This is a portable string hasher. It is not cryptographically secure.
// The quality of the hash is good enough to pass all tests in SMHasher.
// The implementation is derived from the generic routine of aHash.
class HashState {
  __UINT64_TYPE__ buffer;
  __UINT64_TYPE__ pad;
  __UINT64_TYPE__ extra_keys[2];
  inline void update(__UINT64_TYPE__ low, __UINT64_TYPE__ high) {
    __UINT64_TYPE__ combined =
        folded_multiply(low ^ extra_keys[0], high ^ extra_keys[1]);
    buffer = (buffer + pad) ^ combined;
    buffer = __builtin_rotateleft64(buffer, ROTATE);
  }
  inline static __UINT64_TYPE__ mix(__UINT64_TYPE__ seed) {
    HashState mixer{RANDOMNESS[0][0], RANDOMNESS[0][1], RANDOMNESS[0][2],
                    RANDOMNESS[0][3]};
    mixer.update(seed, 0);
    return mixer.finish();
  }

public:
  inline constexpr HashState(__UINT64_TYPE__ a, __UINT64_TYPE__ b, __UINT64_TYPE__ c,
                                  __UINT64_TYPE__ d)
      : buffer(a), pad(b), extra_keys{c, d} {}
  inline HashState(__UINT64_TYPE__ seed) {
    // Mix one more round of the seed to make it stronger.
    __UINT64_TYPE__ mixed = mix(seed);
    buffer = RANDOMNESS[1][0] ^ mixed;
    pad = RANDOMNESS[1][1] ^ mixed;
    extra_keys[0] = RANDOMNESS[1][2] ^ mixed;
    extra_keys[1] = RANDOMNESS[1][3] ^ mixed;
  }
  inline void update(const void *ptr, __SIZE_TYPE__ size) {
    __UINT8_TYPE__ const *bytes = static_cast<const __UINT8_TYPE__ *>(ptr);
    buffer = (buffer + size) * MULTIPLE;
    __UINT64_TYPE__ low, high;
    if (size > 8) {
      if (size > 16) {
        // update tail
        low = read_little_endian<__UINT64_TYPE__>(&bytes[size - 16]);
        high = read_little_endian<__UINT64_TYPE__>(&bytes[size - 8]);
        update(low, high);
        while (size > 16) {
          low = read_little_endian<__UINT64_TYPE__>(&bytes[0]);
          high = read_little_endian<__UINT64_TYPE__>(&bytes[8]);
          update(low, high);
          bytes += 16;
          size -= 16;
        }
      } else {
        low = read_little_endian<__UINT64_TYPE__>(&bytes[0]);
        high = read_little_endian<__UINT64_TYPE__>(&bytes[size - 8]);
        update(low, high);
      }
    } else {
      read_small_values(ptr, size, low, high);
      update(low, high);
    }
  }
  inline __UINT64_TYPE__ finish() {
    int rot = buffer & 63;
    __UINT64_TYPE__ folded = folded_multiply(buffer, pad);
    return __builtin_rotateleft64(folded, rot);
  }
};

} // namespace internal

#define offsetof(A, B) __builtin_offsetof(A, B)

typedef  struct {
  __UINT64_TYPE__ rbx;
  __UINT64_TYPE__ rbp;
  __UINT64_TYPE__ r12;
  __UINT64_TYPE__ r13;
  __UINT64_TYPE__ r14;
  __UINT64_TYPE__ r15;
  __UINTPTR_TYPE__ rsp;
  __UINTPTR_TYPE__ rip;
  __UINT64_TYPE__ __sigmask;
  __UINT64_TYPE__ __has_sigmask : 1;
  __UINT64_TYPE__ __unused : 63;
  __UINT64_TYPE__ __chksum;
} my_jmp_buf;

namespace jmpbuf {
using HashState = internal::HashState;
// Initial values generated by
// https://www.random.org/cgi-bin/randbyte?nbytes=48&format=h
// These values are only used for overlay targets.
inline __UINT64_TYPE__ register_mangle_cookie = 0xdf8a883867040cbc;
inline __UINT64_TYPE__ checksum_mangle_cookie = 0x9ed4fe406ebe9cf9;
inline __UINT64_TYPE__ randomness[4] = {
    0x83b9df7dddf5ab3d,
    0x06c931cca75e15c6,
    0x08280ec9e9a778bf,
    0x111f67f4aafc9276,
};

inline int update_checksum(my_jmp_buf *buf) {
  HashState state{
      randomness[0],
      randomness[1],
      randomness[2],
      randomness[3],
  };
  state.update(buf, offsetof(my_jmp_buf, __chksum));
  buf->__chksum = state.finish() ^ checksum_mangle_cookie;
  return 0;
}

inline void verify(const my_jmp_buf *buf) {
  HashState state{
      randomness[0],
      randomness[1],
      randomness[2],
      randomness[3],
  };
  state.update(buf, offsetof(my_jmp_buf, __chksum));
  auto chksum = state.finish() ^ checksum_mangle_cookie;
  if (chksum != buf->__chksum) {
    __builtin_trap();
  }
}

} // namespace jmpbuf

namespace test {

[[gnu::naked]]
void longjmp (my_jmp_buf * buf, int val) {
#ifndef NO_CHECK
  asm(R"(
   pushq %%rbp
   pushq %%rbx
   mov  %%rdi, %%rbp
   mov  %%esi, %%ebx
   subq $8, %%rsp
   call %P0
   addq $8, %%rsp
   mov  %%ebx, %%esi
   mov  %%rbp, %%rdi
   popq %%rbx
   popq %%rbp
 )" ::"i"(jmpbuf::verify)
      : "rax", "rcx", "rdx", "r8", "r9", "r10", "r11");
#endif

  register __UINT64_TYPE__ rcx __asm__("rcx");
  // Load cookie
  asm("mov %1, %0\n\t" : "=r"(rcx) : "m"(jmpbuf::register_mangle_cookie));

  // load registers from buffer
  // do not pass any invalid values into registers
#define RECOVER(REG)                                                           \
  register __UINT64_TYPE__ REG __asm__(#REG);                                  \
  asm volatile("mov %c[" #REG "](%%rdi), %%rdx\n\t"                            \
               "xor %%rdx, %1\n\t"                                             \
               "mov %%rdx, %0\n\t"                                             \
               : "=r"(REG)                                                     \
               : "r"(rcx), [REG] "i"(offsetof(my_jmp_buf, REG))                 \
               : "rdx");

  RECOVER(rbx);
  RECOVER(rbp);
  RECOVER(r12);
  RECOVER(r13);
  RECOVER(r14);
  RECOVER(r15);
  RECOVER(rsp);

  register int eax __asm__("eax");
  asm volatile(R"(
   xor %0,%0
   cmp $1,%%esi       
   adc %%esi,%0
   mov %c[rip](%%rdi),%%rdx
   xor %%rdx, %%rcx
   jmp *%%rdx
 )"
               : "=r"(eax)
               : [rip] "i"(offsetof(my_jmp_buf, rip))
               : "rdx");
}

[[gnu::naked]]
int setjmp (my_jmp_buf * buf) {
  register __UINT64_TYPE__ rcx __asm__("rcx");
  // Load cookie
  asm("mov %1, %0\n\t" : "=r"(rcx) : "m"(jmpbuf::register_mangle_cookie));
  // store registers to buffer
  // do not pass any invalid values into registers
#define STORE(REG)                                                             \
  asm("mov %%" #REG ", %%rdx\n\t"                                              \
      "xor %%rdx, %%rcx\n\t"                                                   \
      "mov %%rdx, %c[" #REG                                                    \
      "](%%rdi)\n\t" ::[REG] "i"(offsetof(my_jmp_buf, REG))                     \
      : "rdx");

  STORE(rbx);
  STORE(rbp);
  STORE(r12);
  STORE(r13);
  STORE(r14);
  STORE(r15);
  asm(R"(
   lea 8(%%rsp),%%rdx
   xor %%rdx, %%rcx
   mov %%rdx,%c[rsp](%%rdi)
   mov (%%rsp),%%rdx
   xor %%rdx, %%rcx     
   mov %%rdx,%c[rip](%%rdi)
 )" ::[rsp] "i"(offsetof(my_jmp_buf, rsp)),
      [rip] "i"(offsetof(my_jmp_buf, rip))
      : "rdx");
#ifndef NO_CHECK
  // tail call to update checksum
  asm("jmp %P0" : : "i"(jmpbuf::update_checksum));
#else
    asm("xor %eax, %eax\n\tret\n\t");
#endif
}

}

#include <setjmp.h>
#include <chrono>
#include <iostream>
int main() {
    using namespace std::chrono;
    {
        auto x = high_resolution_clock::now();
#pragma push_macro("setjmp")
#undef setjmp
        for (int i = 0; i < 1000000; ++i) {
            my_jmp_buf buf;
            if (test::setjmp(&buf))
                continue;
            test::longjmp(&buf, 0);
        }
        auto y = high_resolution_clock::now();
        std::cout << duration_cast<nanoseconds>(y - x).count() << std::endl;
    }
#pragma pop_macro("setjmp")
    {
        auto x = high_resolution_clock::now();
        for (int i = 0; i < 1000000; ++i) {
            jmp_buf buf;
            if (::setjmp(buf))
                continue;
            ::longjmp(buf, 0);
        }
        auto y = high_resolution_clock::now();
        std::cout << duration_cast<nanoseconds>(y - x).count() << std::endl;
    }
}

```

https://github.com/llvm/llvm-project/pull/101110