[llvm] Support: Add Unsigned Counted LEB128 (PR #167257)

via llvm-commits llvm-commits at lists.llvm.org
Sun Nov 9 15:34:10 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-support

Author: Fangrui Song (MaskRay)

<details>
<summary>Changes</summary>

The MLIR bytecode format
(https://mlir.llvm.org/docs/BytecodeFormat/#signed-variable-width-integers;
see also mlir/lib/Bytecode/Reader/BytecodeReader.cpp)
describes a variant of LEB128 in which the length of the encoding is determined
by counting trailing zero bits in the first byte. Specifically, if the
first byte has n-1 trailing zeros, then the encoded integer occupies n
bytes in total. The special case of a zero first byte signals a 9-byte
encoding.

The remaining bits in the first byte, plus all subsequent bytes, contain the
actual value in little-endian order.

This patch adds Counted LEB128 (CLEB128) encoding and decoding to
llvm/lib/Support. It will be used by object file format features such as the
[ELF Compact Section Header
Table](https://discourse.llvm.org/t/compact-section-header-table-for-elf/88821).

The name CLEB128 is suggested by
https://groups.google.com/g/generic-abi/c/9DPPniRXFa8/m/MJ3jetzZAAAJ


---
Full diff: https://github.com/llvm/llvm-project/pull/167257.diff


3 Files Affected:

- (modified) llvm/include/llvm/Support/LEB128.h (+28) 
- (modified) llvm/lib/Support/LEB128.cpp (+92-1) 
- (modified) llvm/unittests/Support/LEB128Test.cpp (+86) 


``````````diff
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 4e2262fb15c56..6cea80eb74bcf 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -252,6 +252,34 @@ LLVM_ABI extern unsigned getULEB128Size(uint64_t Value);
 /// Utility function to get the size of the SLEB128-encoded value.
 LLVM_ABI extern unsigned getSLEB128Size(int64_t Value);
 
+// Unsigned Counted LEB128: A variant of LEB128 where the length information is
+// determined by counting trailing zero bits in the first byte. Specifically, if
+// the first byte has n-1 trailing zeros, then the encoded integer occupies n
+// bytes total. The special case of a zero first byte signals a 9-byte encoding.
+//
+// The remaining bits in the first byte, plus all subsequent bytes, contain the
+// actual value in little-endian order.
+
+// clang-format off
+// xxxxxxx1: 7 value bits, 1 byte
+// xxxxxx10 xxxxxxxx: 14 value bits, 2 bytes
+// xxxxx100 xxxxxxxx xxxxxxxx: 21 value bits, 3 bytes
+// xxxx1000 xxxxxxxx xxxxxxxx xxxxxxxx: 28 value bits, 4 bytes
+// xxx10000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 35 value bits, 5 bytes
+// xx100000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 42 value bits, 6 bytes
+// x1000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 49 value bits, 7 bytes
+// 10000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 56 value bits, 8 bytes
+//
+// 00000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 64 value bits, 9 bytes
+// In the 9-byte form the last byte should not be 0; otherwise the value fits in 8 bytes.
+// clang-format on
+LLVM_ABI void encodeUCLeb128(uint64_t x, raw_ostream &os);
+LLVM_ABI uint64_t getUCLeb128(const uint8_t *&p, const uint8_t *end);
+LLVM_ABI uint64_t getUCLeb128Unsafe(const uint8_t *&p);
+
+// Note: If we introduce a signed version of CLEB128, we should use sign
+// extension instead of zig-zag encoding. Sign extension generates faster code.
+
 } // namespace llvm
 
 #endif // LLVM_SUPPORT_LEB128_H
diff --git a/llvm/lib/Support/LEB128.cpp b/llvm/lib/Support/LEB128.cpp
index d41b673e9c8a5..703d13c46605c 100644
--- a/llvm/lib/Support/LEB128.cpp
+++ b/llvm/lib/Support/LEB128.cpp
@@ -12,6 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/LEB128.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::support;
 
 namespace llvm {
 
@@ -39,5 +46,89 @@ unsigned getSLEB128Size(int64_t Value) {
   } while (IsMore);
   return Size;
 }
-
 }  // namespace llvm
+
+void llvm::encodeUCLeb128(uint64_t x, raw_ostream &os) {
+  // Fast path for n == 1
+  if (x < 128) {
+    os.write((x << 1) | 1);
+    return;
+  }
+
+  unsigned significantBits = 64 - countl_zero(x);
+  unsigned n = (significantBits + 6) / 7;
+  if (n > 8) {
+    // 9 bytes: 00000000 xxxxxxxx ...
+    os.write(0);
+    endian::write(os, x, endianness::little);
+    return;
+  }
+
+  uint64_t tagged = endian::byte_swap((x << n) | ((uint64_t)1 << (n - 1)),
+                                      endianness::little);
+  os.write((const char *)&tagged, n);
+}
+
+template <int n>
+static inline uint64_t getUCLeb128Case(const uint8_t *&p, uint8_t byte) {
+  uint64_t val = byte >> n;
+  int shift = 8 - n;
+  for (int i = 1; i < n; ++i) {
+    val |= (uint64_t)p[i] << shift;
+    shift += 8;
+  }
+  p += n;
+  return val;
+}
+
+template <bool CheckBounds>
+static uint64_t getUCLeb128Impl(const uint8_t *&p, const uint8_t *end) {
+  if constexpr (CheckBounds) {
+    if (p >= end)
+      return 0;
+  }
+  // Fast path for n == 1
+  uint8_t b0 = p[0];
+  if (b0 & 1) {
+    ++p;
+    return b0 >> 1;
+  }
+
+  unsigned n = llvm::countr_zero(b0) + 1;
+  if constexpr (CheckBounds) {
+    if (end - p < n)
+      return 0;
+  }
+  // Note: If n < 9 and we allow out-of-bounds read, we can use read64le(p) <<
+  // (64-8*n) >> (64-7*n) instead of the following switch statement.
+  switch (n) {
+  case 1:
+    return getUCLeb128Case<1>(p, b0);
+  case 2:
+    return getUCLeb128Case<2>(p, b0);
+  case 3:
+    return getUCLeb128Case<3>(p, b0);
+  case 4:
+    return getUCLeb128Case<4>(p, b0);
+  case 5:
+    return getUCLeb128Case<5>(p, b0);
+  case 6:
+    return getUCLeb128Case<6>(p, b0);
+  case 7:
+    return getUCLeb128Case<7>(p, b0);
+  case 8:
+    return getUCLeb128Case<8>(p, b0);
+  default:
+    // 9 bytes: 00000000 xxxxxxxx ...
+    p += 9;
+    return endian::read64le(p - 8);
+  }
+}
+
+uint64_t llvm::getUCLeb128(const uint8_t *&p, const uint8_t *end) {
+  return getUCLeb128Impl<true>(p, end);
+}
+
+uint64_t llvm::getUCLeb128Unsafe(const uint8_t *&p) {
+  return getUCLeb128Impl<false>(p, nullptr);
+}
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 0c54a2846903b..668fb48278998 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -474,4 +474,90 @@ TEST(LEB128Test, ULEB128Size) {
   EXPECT_EQ(10u, getULEB128Size(UINT64_MAX));
 }
 
+TEST(CLeb128Test, get) {
+#define EXPECT_CLEB128(VALUE, EXPECTED, SIZE)                                  \
+  do {                                                                         \
+    const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE);               \
+    const uint8_t *P = V;                                                      \
+    const uint8_t *End = V + sizeof(VALUE) - 1;                                \
+    uint64_t Result = getUCLeb128(P, End);                                     \
+    EXPECT_EQ(Result, EXPECTED);                                               \
+    EXPECT_EQ(P - V, SIZE);                                                    \
+    P = V;                                                                     \
+    Result = getUCLeb128Unsafe(P);                                             \
+    EXPECT_EQ(Result, EXPECTED);                                               \
+    EXPECT_EQ(P - V, SIZE);                                                    \
+  } while (0)
+
+  // 1-byte fast path (LSB = 1, value = byte >> 1), then 2- and 9-byte forms.
+  EXPECT_CLEB128("\x01", 0u, 1);
+  EXPECT_CLEB128("\x7f", 63u, 1);
+  EXPECT_CLEB128("\xff", 127u, 1);
+  EXPECT_CLEB128("\x02\x02", 128u, 2);
+  EXPECT_CLEB128("\x00\x00\x01\x00\x00\x00\x00\x00\x00", 256u, 9);
+
+  // Test (1<<56)-2
+  EXPECT_CLEB128("\x80\xfe\xff\xff\xff\xff\xff\xff", 0xfffffffffffffeu, 8);
+  EXPECT_CLEB128("\x00\xfe\xff\xff\xff\xff\xff\xff\x00", 0xfffffffffffffeu, 9);
+
+#undef EXPECT_CLEB128
+
+  // Test bounds checking in safe version
+  {
+    const uint8_t data[] = {0x02, 0x02}; // 2-byte encoding for 128
+    const uint8_t *p = data;
+
+    // Insufficient buffer (should return 0)
+    p = data;
+    EXPECT_EQ(getUCLeb128(p, data + 1), 0u);
+    EXPECT_EQ(p, data);
+
+    // Empty buffer
+    p = data;
+    EXPECT_EQ(getUCLeb128(p, data), 0u);
+    EXPECT_EQ(p, data);
+  }
+
+  // Test 9-byte format bounds checking
+  {
+    const uint8_t data[] = {0x00, 0x01, 0x02, 0x03, 0x04,
+                            0x05, 0x06, 0x07, 0x08, 0x09};
+    const uint8_t *p = data;
+
+    // Sufficient buffer for 9-byte format
+    EXPECT_EQ(getUCLeb128(p, data + 10), 0x0807060504030201ULL);
+
+    // Insufficient buffer for 9-byte format
+    p = data;
+    EXPECT_EQ(getUCLeb128(p, data + 8), 0u);
+  }
+}
+
+TEST(CLeb128Test, encode) {
+  // Test round-trip consistency for all encoding lengths.
+  const uint64_t vals[] = {
+      0,                // 1 byte
+      128,              // 2 bytes
+      (1ULL << 14) + 2, // 3 bytes
+      (1ULL << 21) + 3, // 4 bytes
+      (1ULL << 28) + 4, // 5 bytes
+      (1ULL << 35) + 5, // 6 bytes
+      (1ULL << 42) + 6, // 7 bytes
+      (1ULL << 49) + 7, // 8 bytes
+      UINT64_MAX / 2,   // 9 bytes
+      UINT64_MAX - 1,   // 9 bytes
+  };
+  for (uint64_t val : vals) {
+    std::string encoded;
+    raw_string_ostream os(encoded);
+    encodeUCLeb128(val, os);
+
+    const uint8_t *p0 = reinterpret_cast<const uint8_t *>(encoded.data());
+    const uint8_t *p = p0;
+    uint64_t decoded = getUCLeb128Unsafe(p);
+    EXPECT_EQ(val, decoded) << "Round-trip failed for value " << val;
+    EXPECT_EQ(p - p0, encoded.size());
+  }
+}
+
 }  // anonymous namespace

``````````

</details>


https://github.com/llvm/llvm-project/pull/167257


More information about the llvm-commits mailing list