[clang-tools-extra] [clang-tidy] Compact the confusable identifier table (PR #202626)
David Zbarsky via cfe-commits
cfe-commits at lists.llvm.org
Tue Jun 9 07:00:00 PDT 2026
https://github.com/dzbarsky created https://github.com/llvm/llvm-project/pull/202626
Encode confusable replacement strings once in a shared UTF-8 byte array. Each sorted mapping becomes an eight-byte code point, 16-bit offset, and 16-bit length entry instead of reserving nineteen UTF-32 code points for every replacement.
ConfusableIdentifierCheck appends the UTF-8 slice directly after the binary search, removing the per-identifier UTF-32 to UTF-8 conversion. Add compile-time assertions for the eight-byte entry layout and the 64 KiB replacement-data limit, plus generator-side overflow checks.
The logical table shrinks from 508,400 to 63,327 bytes, saving 445,073 bytes. A stripped standalone clang-tidy shrinks from 91,116,608 to 90,654,176 bytes, saving 462,432 bytes (0.508%).
A lookup microbenchmark improves from 79.9 to 40.4 ns median. A 100,000-identifier clang-tidy benchmark improves from 0.343 to 0.314 seconds mean.
Validate all 6,355 replacements byte-for-byte against confusables.txt, deterministic regeneration, byte-identical baseline and candidate diagnostics, and the focused misc-confusable-identifiers test with multi-code-point and non-ASCII replacements. Compile and link the generator, consumer object, and standalone clang-tidy.
Work towards #202616
>From f16a5f2ee0935f6800a577a56517e42fea2bee97 Mon Sep 17 00:00:00 2001
From: David Zbarsky <dzbarsky at gmail.com>
Date: Mon, 8 Jun 2026 23:45:57 -0400
Subject: [PATCH] [clang-tidy] Compact the confusable identifier table
Encode confusable replacement strings once in a shared UTF-8 byte array. Each sorted mapping becomes an eight-byte code point, 16-bit offset, and 16-bit length entry instead of reserving nineteen UTF-32 code points for every replacement.
ConfusableIdentifierCheck appends the UTF-8 slice directly after the binary search, removing the per-identifier UTF-32 to UTF-8 conversion. Add compile-time assertions for the eight-byte entry layout and the 64 KiB replacement-data limit, plus generator-side overflow checks.
The logical table shrinks from 508,400 to 63,327 bytes, saving 445,073 bytes. A stripped standalone clang-tidy shrinks from 91,116,608 to 90,654,176 bytes, saving 462,432 bytes (0.508%).
A lookup microbenchmark improves from 79.9 to 40.4 ns median. A 100,000-identifier clang-tidy benchmark improves from 0.343 to 0.314 seconds mean.
Validate all 6,355 replacements byte-for-byte against confusables.txt, deterministic regeneration, byte-identical baseline and candidate diagnostics, and the focused misc-confusable-identifiers test with multi-code-point and non-ASCII replacements. Compile and link the generator, consumer object, and standalone clang-tidy.
---
.../misc/ConfusableIdentifierCheck.cpp | 37 +++++----
.../ConfusableTable/BuildConfusableTable.cpp | 76 +++++++++++++------
.../checkers/misc/confusable-identifiers.cpp | 10 +++
3 files changed, 81 insertions(+), 42 deletions(-)
diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
index f386bd613dfc6..7647c509ea202 100644
--- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
@@ -13,12 +13,23 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/ConvertUTF.h"
+#include <cstdint>
+
namespace {
// Preprocessed version of
// https://www.unicode.org/Public/security/latest/confusables.txt
//
-// This contains a sorted array of { UTF32 codepoint; UTF32 values[N];}
+// This contains a sorted array of code points and slices into a shared UTF-8
+// replacement array.
+struct ConfusableEntry {
+ llvm::UTF32 CodePoint;
+ uint16_t Offset;
+ uint16_t Length;
+};
+static_assert(sizeof(ConfusableEntry) == 8);
+
#include "Confusables.inc"
+static_assert(sizeof(ConfusableReplacementData) <= 1ULL << 16);
} // namespace
namespace clang::tidy::misc {
@@ -63,26 +74,14 @@ static SmallString<64U> skeleton(StringRef Name) {
break;
}
- const StringRef Key(Prev, Curr - Prev);
- auto *Where = llvm::lower_bound(ConfusableEntries, CodePoint,
- [](decltype(ConfusableEntries[0]) X,
- UTF32 Y) { return X.codepoint < Y; });
- if (Where == std::end(ConfusableEntries) || CodePoint != Where->codepoint) {
+ auto *Where = llvm::lower_bound(
+ ConfusableEntries, CodePoint,
+ [](const ConfusableEntry &X, UTF32 Y) { return X.CodePoint < Y; });
+ if (Where == std::end(ConfusableEntries) || CodePoint != Where->CodePoint) {
Skeleton.append(Prev, Curr);
} else {
- UTF8 Buffer[32];
- UTF8 *BufferStart = std::begin(Buffer);
- UTF8 *IBuffer = BufferStart;
- const UTF32 *ValuesStart = std::begin(Where->values);
- const UTF32 *ValuesEnd = llvm::find(Where->values, '\0');
- if (ConvertUTF32toUTF8(&ValuesStart, ValuesEnd, &IBuffer,
- std::end(Buffer),
- strictConversion) != conversionOK) {
- errs() << "Unicode conversion issue\n";
- break;
- }
- Skeleton.append(reinterpret_cast<char *>(BufferStart),
- reinterpret_cast<char *>(IBuffer));
+ Skeleton.append(
+ StringRef(ConfusableReplacementData + Where->Offset, Where->Length));
}
}
return Skeleton;
diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp
index 0c2f289d16922..af5791c21923e 100644
--- a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp
+++ b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp
@@ -8,12 +8,23 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <limits>
+
using namespace llvm;
+namespace {
+struct ConfusableEntry {
+ UTF32 CodePoint;
+ std::string Replacement;
+};
+} // namespace
+
int main(int argc, char *argv[]) {
auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
if (!ErrorOrBuffer)
@@ -24,7 +35,7 @@ int main(int argc, char *argv[]) {
SmallVector<StringRef> Lines;
SplitString(Content, Lines, "\r\n");
- std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
+ std::vector<ConfusableEntry> Entries;
SmallVector<StringRef> Values;
for (const StringRef Line : Lines) {
if (Line.starts_with('#'))
@@ -41,42 +52,61 @@ int main(int argc, char *argv[]) {
llvm::UTF32 CodePoint = 0;
From.getAsInteger(16, CodePoint);
- SmallVector<llvm::UTF32> To;
+ std::string Replacement;
SmallVector<StringRef> ToN;
Values[1].split(ToN, ' ', -1, false);
for (const StringRef ToI : ToN) {
llvm::UTF32 ToCodePoint = 0;
ToI.trim().getAsInteger(16, ToCodePoint);
- To.push_back(ToCodePoint);
+ char Encoded[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
+ char *EncodedEnd = Encoded;
+ if (!ConvertCodePointToUTF8(ToCodePoint, EncodedEnd)) {
+ errs() << "Failed to encode code point: " << ToI << "\n";
+ return 3;
+ }
+ Replacement.append(Encoded, EncodedEnd);
}
- // Sentinel
- To.push_back(0);
- Entries.emplace_back(CodePoint, To);
+ Entries.push_back({CodePoint, std::move(Replacement)});
}
- llvm::sort(Entries);
+ llvm::sort(Entries,
+ [](const ConfusableEntry &LHS, const ConfusableEntry &RHS) {
+ return LHS.CodePoint < RHS.CodePoint;
+ });
- const unsigned LargestValue =
- llvm::max_element(Entries, [](const auto &Entry0, const auto &Entry1) {
- return Entry0.second.size() < Entry1.second.size();
- })->second.size();
+ StringMap<uint16_t> ReplacementOffsets;
+ std::string ReplacementData;
+ for (const ConfusableEntry &Entry : Entries) {
+ if (ReplacementOffsets.contains(Entry.Replacement))
+ continue;
+ if (ReplacementData.size() + Entry.Replacement.size() >
+ std::numeric_limits<uint16_t>::max()) {
+ errs() << "Confusable replacement data exceeds 16-bit offsets\n";
+ return 4;
+ }
+ ReplacementOffsets.try_emplace(
+ Entry.Replacement, static_cast<uint16_t>(ReplacementData.size()));
+ ReplacementData.append(Entry.Replacement);
+ }
std::error_code Ec;
llvm::raw_fd_ostream Os(argv[2], Ec);
- // FIXME: If memory consumption and/or lookup time becomes a constraint, it
- // maybe worth using a more elaborate data structure.
- Os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
- << "];} "
- "ConfusableEntries[] = {\n";
- for (const auto &Values : Entries) {
- Os << " { ";
- Os << Values.first;
- Os << ", {";
- for (auto CP : Values.second)
- Os << CP << ", ";
+ static constexpr char HexDigits[] = "0123456789ABCDEF";
+ Os << "constexpr char ConfusableReplacementData[] =\n";
+ for (size_t I = 0; I < ReplacementData.size(); I += 32) {
+ Os << " \"";
+ for (unsigned char C : StringRef(ReplacementData).substr(I, 32))
+ Os << "\\x" << HexDigits[C >> 4] << HexDigits[C & 0x0F];
+ Os << "\"\n";
+ }
+ Os << " ;\n\n";
- Os << "}},\n";
+ Os << "constexpr ConfusableEntry ConfusableEntries[] = {\n";
+ for (const ConfusableEntry &Entry : Entries) {
+ const auto It = ReplacementOffsets.find(Entry.Replacement);
+ Os << " {" << Entry.CodePoint << ", " << It->second << ", "
+ << Entry.Replacement.size() << "},\n";
}
Os << "};\n";
return 0;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp
index fb7eeed7e715f..82520eac4523b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp
@@ -17,6 +17,16 @@ int ll;
// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'll' is confusable with 'l1' [misc-confusable-identifiers]
// CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here
+int rn;
+int m;
+// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'm' is confusable with 'rn' [misc-confusable-identifiers]
+// CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here
+
+int μ;
+int µ;
+// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'µ' is confusable with 'μ' [misc-confusable-identifiers]
+// CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here
+
bool f0(const char *q1, const char *ql) {
// CHECK-MESSAGES: :[[#@LINE-1]]:37: warning: 'ql' is confusable with 'q1' [misc-confusable-identifiers]
// CHECK-MESSAGES: :[[#@LINE-2]]:21: note: other declaration found here
More information about the cfe-commits
mailing list