[llvm] Support: Add Unsigned Counted LEB128 (PR #167257)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 9 15:34:10 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-support
Author: Fangrui Song (MaskRay)
<details>
<summary>Changes</summary>
https://mlir.llvm.org/docs/BytecodeFormat/#signed-variable-width-integers
(mlir/lib/Bytecode/Reader/BytecodeReader.cpp)
describes a variant of LEB128 where the encoded length is determined
by counting trailing zero bits in the first byte. Specifically, if the
first byte has n-1 trailing zeros (1 <= n <= 8), then the encoded integer
occupies n bytes total. A zero first byte signals a 9-byte encoding, in
which the first byte carries no value bits and the following 8 bytes hold
the full 64-bit value.
The remaining bits in the first byte, plus all subsequent bytes, contain the
actual value in little-endian order.
This patch adds unsigned Counted LEB128 (CLEB128) encoding and decoding to
llvm/lib/Support.
It will be used by object file format features such as the
[ELF Compact Section Header
Table](https://discourse.llvm.org/t/compact-section-header-table-for-elf/88821).
The name CLEB128 was suggested in
https://groups.google.com/g/generic-abi/c/9DPPniRXFa8/m/MJ3jetzZAAAJ
---
Full diff: https://github.com/llvm/llvm-project/pull/167257.diff
3 Files Affected:
- (modified) llvm/include/llvm/Support/LEB128.h (+28)
- (modified) llvm/lib/Support/LEB128.cpp (+92-1)
- (modified) llvm/unittests/Support/LEB128Test.cpp (+86)
``````````diff
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 4e2262fb15c56..6cea80eb74bcf 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -252,6 +252,34 @@ LLVM_ABI extern unsigned getULEB128Size(uint64_t Value);
/// Utility function to get the size of the SLEB128-encoded value.
LLVM_ABI extern unsigned getSLEB128Size(int64_t Value);
+// Unsigned Counted LEB128: A variant of LEB128 where the length information is
+// determined by counting trailing zero bits in the first byte. Specifically, if
+// the first byte has n-1 trailing zeros, then the encoded integer occupies n
+// bytes total. The special case of a zero first byte signals a 9-byte encoding.
+//
+// The remaining bits in the first byte, plus all subsequent bytes, contain the
+// actual value in little-endian order.
+
+// clang-format off
+// xxxxxxx1: 7 value bits, 1 byte
+// xxxxxx10 xxxxxxxx: 14 value bits, 2 bytes
+// xxxxx100 xxxxxxxx xxxxxxxx: 21 value bits, 3 bytes
+// xxxx1000 xxxxxxxx xxxxxxxx xxxxxxxx: 28 value bits, 4 bytes
+// xxx10000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 35 value bits, 5 bytes
+// xx100000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 42 value bits, 6 bytes
+// x1000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 49 value bits, 7 bytes
+// 10000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 56 value bits, 8 bytes
+//
+// 00000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx: 64 value bits, 9 bytes
+// In a multi-byte encoding the last byte should not be 0 (a shorter encoding exists); the decoder does not enforce this.
+// clang-format on
+LLVM_ABI void encodeUCLeb128(uint64_t x, raw_ostream &os);
+LLVM_ABI uint64_t getUCLeb128(const uint8_t *&p, const uint8_t *end);
+LLVM_ABI uint64_t getUCLeb128Unsafe(const uint8_t *&p);
+
+// Note: If we introduce a signed version of CLEB128, we should use sign
+// extension instead of zig-zag encoding; sign extension generates faster code.
+
} // namespace llvm
#endif // LLVM_SUPPORT_LEB128_H
diff --git a/llvm/lib/Support/LEB128.cpp b/llvm/lib/Support/LEB128.cpp
index d41b673e9c8a5..703d13c46605c 100644
--- a/llvm/lib/Support/LEB128.cpp
+++ b/llvm/lib/Support/LEB128.cpp
@@ -12,6 +12,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/LEB128.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::support;
namespace llvm {
@@ -39,5 +46,89 @@ unsigned getSLEB128Size(int64_t Value) {
} while (IsMore);
return Size;
}
-
} // namespace llvm
+
+void llvm::encodeUCLeb128(uint64_t x, raw_ostream &os) {
+ // Fast path for n == 1
+ if (x < 128) {
+ os.write((x << 1) | 1);
+ return;
+ }
+
+ unsigned significantBits = 64 - countl_zero(x);
+ unsigned n = (significantBits + 6) / 7;
+ if (n > 8) {
+ // 9 bytes: 00000000 xxxxxxxx ...
+ os.write(0);
+ endian::write(os, x, endianness::little);
+ return;
+ }
+
+ uint64_t tagged = endian::byte_swap((x << n) | ((uint64_t)1 << (n - 1)),
+ endianness::little);
+ os.write((const char *)&tagged, n);
+}
+
+template <int n>
+static inline uint64_t getUCLeb128Case(const uint8_t *&p, uint8_t byte) {
+ uint64_t val = byte >> n;
+ int shift = 8 - n;
+ for (int i = 1; i < n; ++i) {
+ val |= (uint64_t)p[i] << shift;
+ shift += 8;
+ }
+ p += n;
+ return val;
+}
+
+template <bool CheckBounds>
+static uint64_t getUCLeb128Impl(const uint8_t *&p, const uint8_t *end) {
+ if constexpr (CheckBounds) {
+ if (p >= end)
+ return 0;
+ }
+ // Fast path for n == 1
+ uint8_t b0 = p[0];
+ if (b0 & 1) {
+ ++p;
+ return b0 >> 1;
+ }
+
+ unsigned n = llvm::countr_zero(b0) + 1;
+ if constexpr (CheckBounds) {
+ if (end - p < n)
+ return 0;
+ }
+ // Note: If n < 9 and out-of-bounds reads were allowed, the switch below could
+ // be replaced with (read64le(p) << (64-8*n)) >> (64-7*n).
+ switch (n) {
+ case 1:
+ return getUCLeb128Case<1>(p, b0);
+ case 2:
+ return getUCLeb128Case<2>(p, b0);
+ case 3:
+ return getUCLeb128Case<3>(p, b0);
+ case 4:
+ return getUCLeb128Case<4>(p, b0);
+ case 5:
+ return getUCLeb128Case<5>(p, b0);
+ case 6:
+ return getUCLeb128Case<6>(p, b0);
+ case 7:
+ return getUCLeb128Case<7>(p, b0);
+ case 8:
+ return getUCLeb128Case<8>(p, b0);
+ default:
+ // 9 bytes: 00000000 xxxxxxxx ...
+ p += 9;
+ return endian::read64le(p - 8);
+ }
+}
+
+uint64_t llvm::getUCLeb128(const uint8_t *&p, const uint8_t *end) {
+ return getUCLeb128Impl<true>(p, end);
+}
+
+uint64_t llvm::getUCLeb128Unsafe(const uint8_t *&p) {
+ return getUCLeb128Impl<false>(p, nullptr);
+}
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 0c54a2846903b..668fb48278998 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -474,4 +474,90 @@ TEST(LEB128Test, ULEB128Size) {
EXPECT_EQ(10u, getULEB128Size(UINT64_MAX));
}
+TEST(CLeb128Test, get) {
+#define EXPECT_CLEB128(VALUE, EXPECTED, SIZE) \
+ do { \
+ const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE); \
+ const uint8_t *P = V; \
+ const uint8_t *End = V + sizeof(VALUE) - 1; \
+ uint64_t Result = getUCLeb128(P, End); \
+ EXPECT_EQ(Result, EXPECTED); \
+ EXPECT_EQ(P - V, SIZE); \
+ P = V; \
+ Result = getUCLeb128Unsafe(P); \
+ EXPECT_EQ(Result, EXPECTED); \
+ EXPECT_EQ(P - V, SIZE); \
+ } while (0)
+
+ // Single-byte fast path (LSB = 1, value = byte >> 1), then 2-byte and 9-byte forms
+ EXPECT_CLEB128("\x01", 0u, 1);
+ EXPECT_CLEB128("\x7f", 63u, 1);
+ EXPECT_CLEB128("\xff", 127u, 1);
+ EXPECT_CLEB128("\x02\x02", 128u, 2);
+ EXPECT_CLEB128("\x00\x00\x01\x00\x00\x00\x00\x00\x00", 256u, 9);
+
+ // Test (1<<56)-2 in the 8-byte form and in a non-canonical 9-byte form
+ EXPECT_CLEB128("\x80\xfe\xff\xff\xff\xff\xff\xff", 0xfffffffffffffeu, 8);
+ EXPECT_CLEB128("\x00\xfe\xff\xff\xff\xff\xff\xff\x00", 0xfffffffffffffeu, 9);
+
+#undef EXPECT_CLEB128
+
+ // Test bounds checking in safe version
+ {
+ const uint8_t data[] = {0x02, 0x02}; // 2-byte encoding for 128
+ const uint8_t *p = data;
+
+ // Insufficient buffer (should return 0 and leave p unchanged)
+ p = data;
+ EXPECT_EQ(getUCLeb128(p, data + 1), 0u);
+ EXPECT_EQ(p, data);
+
+ // Empty buffer
+ p = data;
+ EXPECT_EQ(getUCLeb128(p, data), 0u);
+ EXPECT_EQ(p, data);
+ }
+
+ // Test 9-byte format bounds checking
+ {
+ const uint8_t data[] = {0x00, 0x01, 0x02, 0x03, 0x04,
+ 0x05, 0x06, 0x07, 0x08, 0x09};
+ const uint8_t *p = data;
+
+ // Sufficient buffer for 9-byte format
+ EXPECT_EQ(getUCLeb128(p, data + 10), 0x0807060504030201ULL);
+
+ // Insufficient buffer for 9-byte format
+ p = data;
+ EXPECT_EQ(getUCLeb128(p, data + 8), 0u);
+ }
+}
+
+TEST(CLeb128Test, encode) {
+ // Test round-trip consistency for all encoding lengths.
+ const uint64_t vals[] = {
+ 0, // 1 byte
+ 128, // 2 bytes
+ (1ULL << 14) + 2, // 3 bytes
+ (1ULL << 21) + 3, // 4 bytes
+ (1ULL << 28) + 4, // 5 bytes
+ (1ULL << 35) + 5, // 6 bytes
+ (1ULL << 42) + 6, // 7 bytes
+ (1ULL << 49) + 7, // 8 bytes
+ UINT64_MAX / 2, // 9 bytes
+ UINT64_MAX - 1, // 9 bytes
+ };
+ for (uint64_t val : vals) {
+ std::string encoded;
+ raw_string_ostream os(encoded);
+ encodeUCLeb128(val, os);
+
+ const uint8_t *p0 = reinterpret_cast<const uint8_t *>(encoded.data());
+ const uint8_t *p = p0;
+ uint64_t decoded = getUCLeb128Unsafe(p);
+ EXPECT_EQ(val, decoded) << "Round-trip failed for value " << val;
+ EXPECT_EQ(p - p0, encoded.size());
+ }
+}
+
} // anonymous namespace
``````````
</details>
https://github.com/llvm/llvm-project/pull/167257
More information about the llvm-commits
mailing list