[llvm] r325107 - Implement a case-folding version of DJB hash
Pavel Labath via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 14 02:05:09 PST 2018
Author: labath
Date: Wed Feb 14 02:05:09 2018
New Revision: 325107
URL: http://llvm.org/viewvc/llvm-project?rev=325107&view=rev
Log:
Implement a case-folding version of DJB hash
Summary:
This patch implements a variant of the DJB hash function which folds the
input according to the algorithm in the Dwarf 5 specification (Section
6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18,
"Case Mappings").
To achieve this, I have added a llvm::sys::unicode::foldCharSimple
function, which performs this mapping. The implementation of this
function was generated from the CaseMatching.txt file from the Unicode
spec using a python script (which is also included in this patch). The
script tries to optimize the function by coalescing adjecant mappings
with the same shift and stride (terms I made up). Theoretically, it
could be made a bit smarter and merge adjecant blocks that were
interrupted by only one or two characters with exceptional mapping, but
this would save only a couple of branches, while it would greatly
complicate the implementation, so I deemed it was not worth it.
Since we assume that the vast majority of the input characters will be
US-ASCII, the folding hash function has a fast-path for handling these,
and only whips out the full decode+fold+encode logic if we encounter a
character outside of this range. It might be possible to implement the
folding directly on utf8 sequences, but this would also bring a lot of
complexity for the few cases where we will actually need to process
non-ascii characters.
Reviewers: JDevlieghere, aprantl, probinson, dblaikie
Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits
Differential Revision: https://reviews.llvm.org/D42740
Added:
llvm/trunk/lib/Support/UnicodeCaseFold.cpp
llvm/trunk/unittests/Support/DJBTest.cpp
llvm/trunk/utils/unicode-case-fold.py (with props)
Modified:
llvm/trunk/include/llvm/Support/DJB.h
llvm/trunk/include/llvm/Support/Unicode.h
llvm/trunk/lib/Support/CMakeLists.txt
llvm/trunk/lib/Support/DJB.cpp
llvm/trunk/unittests/Support/CMakeLists.txt
Modified: llvm/trunk/include/llvm/Support/DJB.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Support/DJB.h?rev=325107&r1=325106&r2=325107&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Support/DJB.h (original)
+++ llvm/trunk/include/llvm/Support/DJB.h Wed Feb 14 02:05:09 2018
@@ -20,6 +20,10 @@ namespace llvm {
/// The Bernstein hash function used by the DWARF accelerator tables.
uint32_t djbHash(StringRef Buffer, uint32_t H = 5381);
+
+/// Computes the Bernstein hash after folding the input according to the Dwarf 5
+/// standard case folding rules.
+uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381);
} // namespace llvm
#endif // LLVM_SUPPORT_DJB_H
Modified: llvm/trunk/include/llvm/Support/Unicode.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Support/Unicode.h?rev=325107&r1=325106&r2=325107&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Support/Unicode.h (original)
+++ llvm/trunk/include/llvm/Support/Unicode.h Wed Feb 14 02:05:09 2018
@@ -60,6 +60,10 @@ bool isPrintable(int UCS);
/// * 1 for each of the remaining characters.
int columnWidthUTF8(StringRef Text);
+/// Fold input unicode character according the the Simple unicode case folding
+/// rules.
+int foldCharSimple(int C);
+
} // namespace unicode
} // namespace sys
} // namespace llvm
Modified: llvm/trunk/lib/Support/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/CMakeLists.txt?rev=325107&r1=325106&r2=325107&view=diff
==============================================================================
--- llvm/trunk/lib/Support/CMakeLists.txt (original)
+++ llvm/trunk/lib/Support/CMakeLists.txt Wed Feb 14 02:05:09 2018
@@ -113,6 +113,7 @@ add_llvm_library(LLVMSupport
Triple.cpp
Twine.cpp
Unicode.cpp
+ UnicodeCaseFold.cpp
YAMLParser.cpp
YAMLTraits.cpp
raw_os_ostream.cpp
Modified: llvm/trunk/lib/Support/DJB.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/DJB.cpp?rev=325107&r1=325106&r2=325107&view=diff
==============================================================================
--- llvm/trunk/lib/Support/DJB.cpp (original)
+++ llvm/trunk/lib/Support/DJB.cpp Wed Feb 14 02:05:09 2018
@@ -12,9 +12,85 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/DJB.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Unicode.h"
+
+using namespace llvm;
+
+static inline uint32_t djbHashChar(char C, uint32_t H) {
+ return (H << 5) + H + C;
+}
uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
for (char C : Buffer.bytes())
- H = ((H << 5) + H) + C;
+ H = djbHashChar(C, H);
+ return H;
+}
+
+static UTF32 chopOneUTF32(StringRef &Buffer) {
+ UTF32 C;
+ const UTF8 *const Begin8Const =
+ reinterpret_cast<const UTF8 *>(Buffer.begin());
+ const UTF8 *Begin8 = Begin8Const;
+ UTF32 *Begin32 = &C;
+
+ // In lenient mode we will always end up with a "reasonable" value in C for
+ // non-empty input.
+ assert(!Buffer.empty());
+ ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()),
+ &Begin32, &C + 1, lenientConversion);
+ Buffer = Buffer.drop_front(Begin8 - Begin8Const);
+ return C;
+}
+
+static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
+ const UTF32 *Begin32 = &C;
+ UTF8 *Begin8 = Storage.begin();
+
+ // The case-folded output should always be a valid unicode character, so use
+ // strict mode here.
+ ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
+ Storage.end(), strictConversion);
+ assert(CR == conversionOK && "Case folding produced invalid char?");
+ (void)CR;
+ return StringRef(reinterpret_cast<char *>(Storage.begin()),
+ Begin8 - Storage.begin());
+}
+
+static UTF32 foldCharDwarf(UTF32 C) {
+ // DWARF v5 addition to the unicode folding rules.
+ // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
+ // Above" into "i".
+ if (C == 0x130 || C == 0x131)
+ return 'i';
+ return sys::unicode::foldCharSimple(C);
+}
+
+static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
+ UTF32 C = chopOneUTF32(Buffer);
+
+ C = foldCharDwarf(C);
+
+ std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage;
+ StringRef Folded = toUTF8(C, Storage);
+ return djbHash(Folded, H);
+}
+
+uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
+ while (!Buffer.empty()) {
+ unsigned char C = Buffer.front();
+ if (LLVM_LIKELY(C <= 0x7f)) {
+ // US-ASCII, encoded as one character in utf-8.
+ // This is by far the most common case, so handle this specially.
+ if (C >= 'A' && C <= 'Z')
+ C = 'a' + (C - 'A'); // fold uppercase into lowercase
+ H = djbHashChar(C, H);
+ Buffer = Buffer.drop_front();
+ continue;
+ }
+ H = caseFoldingDjbHashCharSlow(Buffer, H);
+ }
return H;
}
Added: llvm/trunk/lib/Support/UnicodeCaseFold.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/UnicodeCaseFold.cpp?rev=325107&view=auto
==============================================================================
--- llvm/trunk/lib/Support/UnicodeCaseFold.cpp (added)
+++ llvm/trunk/lib/Support/UnicodeCaseFold.cpp Wed Feb 14 02:05:09 2018
@@ -0,0 +1,742 @@
+//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//
+//
+// This file was generated by utils/unicode-case-fold.py from the Unicode
+// case folding database at
+// http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt
+//
+// To regenerate this file, run:
+// utils/unicode-case-fold.py \
+// "http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt" \
+// > lib/Support/UnicodeCaseFold.cpp
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Unicode.h"
+
+int llvm::sys::unicode::foldCharSimple(int C) {
+ if (C < 0x0041)
+ return C;
+ // 26 characters
+ if (C <= 0x005a)
+ return C + 32;
+ // MICRO SIGN
+ if (C == 0x00b5)
+ return 0x03bc;
+ if (C < 0x00c0)
+ return C;
+ // 23 characters
+ if (C <= 0x00d6)
+ return C + 32;
+ if (C < 0x00d8)
+ return C;
+ // 7 characters
+ if (C <= 0x00de)
+ return C + 32;
+ if (C < 0x0100)
+ return C;
+ // 24 characters
+ if (C <= 0x012e)
+ return C | 1;
+ if (C < 0x0132)
+ return C;
+ // 3 characters
+ if (C <= 0x0136)
+ return C | 1;
+ if (C < 0x0139)
+ return C;
+ // 8 characters
+ if (C <= 0x0147 && C % 2 == 1)
+ return C + 1;
+ if (C < 0x014a)
+ return C;
+ // 23 characters
+ if (C <= 0x0176)
+ return C | 1;
+ // LATIN CAPITAL LETTER Y WITH DIAERESIS
+ if (C == 0x0178)
+ return 0x00ff;
+ if (C < 0x0179)
+ return C;
+ // 3 characters
+ if (C <= 0x017d && C % 2 == 1)
+ return C + 1;
+ // LATIN SMALL LETTER LONG S
+ if (C == 0x017f)
+ return 0x0073;
+ // LATIN CAPITAL LETTER B WITH HOOK
+ if (C == 0x0181)
+ return 0x0253;
+ if (C < 0x0182)
+ return C;
+ // 2 characters
+ if (C <= 0x0184)
+ return C | 1;
+ // LATIN CAPITAL LETTER OPEN O
+ if (C == 0x0186)
+ return 0x0254;
+ // LATIN CAPITAL LETTER C WITH HOOK
+ if (C == 0x0187)
+ return 0x0188;
+ if (C < 0x0189)
+ return C;
+ // 2 characters
+ if (C <= 0x018a)
+ return C + 205;
+ // LATIN CAPITAL LETTER D WITH TOPBAR
+ if (C == 0x018b)
+ return 0x018c;
+ // LATIN CAPITAL LETTER REVERSED E
+ if (C == 0x018e)
+ return 0x01dd;
+ // LATIN CAPITAL LETTER SCHWA
+ if (C == 0x018f)
+ return 0x0259;
+ // LATIN CAPITAL LETTER OPEN E
+ if (C == 0x0190)
+ return 0x025b;
+ // LATIN CAPITAL LETTER F WITH HOOK
+ if (C == 0x0191)
+ return 0x0192;
+ // LATIN CAPITAL LETTER G WITH HOOK
+ if (C == 0x0193)
+ return 0x0260;
+ // LATIN CAPITAL LETTER GAMMA
+ if (C == 0x0194)
+ return 0x0263;
+ // LATIN CAPITAL LETTER IOTA
+ if (C == 0x0196)
+ return 0x0269;
+ // LATIN CAPITAL LETTER I WITH STROKE
+ if (C == 0x0197)
+ return 0x0268;
+ // LATIN CAPITAL LETTER K WITH HOOK
+ if (C == 0x0198)
+ return 0x0199;
+ // LATIN CAPITAL LETTER TURNED M
+ if (C == 0x019c)
+ return 0x026f;
+ // LATIN CAPITAL LETTER N WITH LEFT HOOK
+ if (C == 0x019d)
+ return 0x0272;
+ // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
+ if (C == 0x019f)
+ return 0x0275;
+ if (C < 0x01a0)
+ return C;
+ // 3 characters
+ if (C <= 0x01a4)
+ return C | 1;
+ // LATIN LETTER YR
+ if (C == 0x01a6)
+ return 0x0280;
+ // LATIN CAPITAL LETTER TONE TWO
+ if (C == 0x01a7)
+ return 0x01a8;
+ // LATIN CAPITAL LETTER ESH
+ if (C == 0x01a9)
+ return 0x0283;
+ // LATIN CAPITAL LETTER T WITH HOOK
+ if (C == 0x01ac)
+ return 0x01ad;
+ // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
+ if (C == 0x01ae)
+ return 0x0288;
+ // LATIN CAPITAL LETTER U WITH HORN
+ if (C == 0x01af)
+ return 0x01b0;
+ if (C < 0x01b1)
+ return C;
+ // 2 characters
+ if (C <= 0x01b2)
+ return C + 217;
+ if (C < 0x01b3)
+ return C;
+ // 2 characters
+ if (C <= 0x01b5 && C % 2 == 1)
+ return C + 1;
+ // LATIN CAPITAL LETTER EZH
+ if (C == 0x01b7)
+ return 0x0292;
+ if (C < 0x01b8)
+ return C;
+ // 2 characters
+ if (C <= 0x01bc && C % 4 == 0)
+ return C + 1;
+ // LATIN CAPITAL LETTER DZ WITH CARON
+ if (C == 0x01c4)
+ return 0x01c6;
+ // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
+ if (C == 0x01c5)
+ return 0x01c6;
+ // LATIN CAPITAL LETTER LJ
+ if (C == 0x01c7)
+ return 0x01c9;
+ // LATIN CAPITAL LETTER L WITH SMALL LETTER J
+ if (C == 0x01c8)
+ return 0x01c9;
+ // LATIN CAPITAL LETTER NJ
+ if (C == 0x01ca)
+ return 0x01cc;
+ if (C < 0x01cb)
+ return C;
+ // 9 characters
+ if (C <= 0x01db && C % 2 == 1)
+ return C + 1;
+ if (C < 0x01de)
+ return C;
+ // 9 characters
+ if (C <= 0x01ee)
+ return C | 1;
+ // LATIN CAPITAL LETTER DZ
+ if (C == 0x01f1)
+ return 0x01f3;
+ if (C < 0x01f2)
+ return C;
+ // 2 characters
+ if (C <= 0x01f4)
+ return C | 1;
+ // LATIN CAPITAL LETTER HWAIR
+ if (C == 0x01f6)
+ return 0x0195;
+ // LATIN CAPITAL LETTER WYNN
+ if (C == 0x01f7)
+ return 0x01bf;
+ if (C < 0x01f8)
+ return C;
+ // 20 characters
+ if (C <= 0x021e)
+ return C | 1;
+ // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
+ if (C == 0x0220)
+ return 0x019e;
+ if (C < 0x0222)
+ return C;
+ // 9 characters
+ if (C <= 0x0232)
+ return C | 1;
+ // LATIN CAPITAL LETTER A WITH STROKE
+ if (C == 0x023a)
+ return 0x2c65;
+ // LATIN CAPITAL LETTER C WITH STROKE
+ if (C == 0x023b)
+ return 0x023c;
+ // LATIN CAPITAL LETTER L WITH BAR
+ if (C == 0x023d)
+ return 0x019a;
+ // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
+ if (C == 0x023e)
+ return 0x2c66;
+ // LATIN CAPITAL LETTER GLOTTAL STOP
+ if (C == 0x0241)
+ return 0x0242;
+ // LATIN CAPITAL LETTER B WITH STROKE
+ if (C == 0x0243)
+ return 0x0180;
+ // LATIN CAPITAL LETTER U BAR
+ if (C == 0x0244)
+ return 0x0289;
+ // LATIN CAPITAL LETTER TURNED V
+ if (C == 0x0245)
+ return 0x028c;
+ if (C < 0x0246)
+ return C;
+ // 5 characters
+ if (C <= 0x024e)
+ return C | 1;
+ // COMBINING GREEK YPOGEGRAMMENI
+ if (C == 0x0345)
+ return 0x03b9;
+ if (C < 0x0370)
+ return C;
+ // 2 characters
+ if (C <= 0x0372)
+ return C | 1;
+ // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
+ if (C == 0x0376)
+ return 0x0377;
+ // GREEK CAPITAL LETTER YOT
+ if (C == 0x037f)
+ return 0x03f3;
+ // GREEK CAPITAL LETTER ALPHA WITH TONOS
+ if (C == 0x0386)
+ return 0x03ac;
+ if (C < 0x0388)
+ return C;
+ // 3 characters
+ if (C <= 0x038a)
+ return C + 37;
+ // GREEK CAPITAL LETTER OMICRON WITH TONOS
+ if (C == 0x038c)
+ return 0x03cc;
+ if (C < 0x038e)
+ return C;
+ // 2 characters
+ if (C <= 0x038f)
+ return C + 63;
+ if (C < 0x0391)
+ return C;
+ // 17 characters
+ if (C <= 0x03a1)
+ return C + 32;
+ if (C < 0x03a3)
+ return C;
+ // 9 characters
+ if (C <= 0x03ab)
+ return C + 32;
+ // GREEK SMALL LETTER FINAL SIGMA
+ if (C == 0x03c2)
+ return 0x03c3;
+ // GREEK CAPITAL KAI SYMBOL
+ if (C == 0x03cf)
+ return 0x03d7;
+ // GREEK BETA SYMBOL
+ if (C == 0x03d0)
+ return 0x03b2;
+ // GREEK THETA SYMBOL
+ if (C == 0x03d1)
+ return 0x03b8;
+ // GREEK PHI SYMBOL
+ if (C == 0x03d5)
+ return 0x03c6;
+ // GREEK PI SYMBOL
+ if (C == 0x03d6)
+ return 0x03c0;
+ if (C < 0x03d8)
+ return C;
+ // 12 characters
+ if (C <= 0x03ee)
+ return C | 1;
+ // GREEK KAPPA SYMBOL
+ if (C == 0x03f0)
+ return 0x03ba;
+ // GREEK RHO SYMBOL
+ if (C == 0x03f1)
+ return 0x03c1;
+ // GREEK CAPITAL THETA SYMBOL
+ if (C == 0x03f4)
+ return 0x03b8;
+ // GREEK LUNATE EPSILON SYMBOL
+ if (C == 0x03f5)
+ return 0x03b5;
+ // GREEK CAPITAL LETTER SHO
+ if (C == 0x03f7)
+ return 0x03f8;
+ // GREEK CAPITAL LUNATE SIGMA SYMBOL
+ if (C == 0x03f9)
+ return 0x03f2;
+ // GREEK CAPITAL LETTER SAN
+ if (C == 0x03fa)
+ return 0x03fb;
+ if (C < 0x03fd)
+ return C;
+ // 3 characters
+ if (C <= 0x03ff)
+ return C + -130;
+ if (C < 0x0400)
+ return C;
+ // 16 characters
+ if (C <= 0x040f)
+ return C + 80;
+ if (C < 0x0410)
+ return C;
+ // 32 characters
+ if (C <= 0x042f)
+ return C + 32;
+ if (C < 0x0460)
+ return C;
+ // 17 characters
+ if (C <= 0x0480)
+ return C | 1;
+ if (C < 0x048a)
+ return C;
+ // 27 characters
+ if (C <= 0x04be)
+ return C | 1;
+ // CYRILLIC LETTER PALOCHKA
+ if (C == 0x04c0)
+ return 0x04cf;
+ if (C < 0x04c1)
+ return C;
+ // 7 characters
+ if (C <= 0x04cd && C % 2 == 1)
+ return C + 1;
+ if (C < 0x04d0)
+ return C;
+ // 48 characters
+ if (C <= 0x052e)
+ return C | 1;
+ if (C < 0x0531)
+ return C;
+ // 38 characters
+ if (C <= 0x0556)
+ return C + 48;
+ if (C < 0x10a0)
+ return C;
+ // 38 characters
+ if (C <= 0x10c5)
+ return C + 7264;
+ if (C < 0x10c7)
+ return C;
+ // 2 characters
+ if (C <= 0x10cd && C % 6 == 5)
+ return C + 7264;
+ if (C < 0x13f8)
+ return C;
+ // 6 characters
+ if (C <= 0x13fd)
+ return C + -8;
+ // CYRILLIC SMALL LETTER ROUNDED VE
+ if (C == 0x1c80)
+ return 0x0432;
+ // CYRILLIC SMALL LETTER LONG-LEGGED DE
+ if (C == 0x1c81)
+ return 0x0434;
+ // CYRILLIC SMALL LETTER NARROW O
+ if (C == 0x1c82)
+ return 0x043e;
+ if (C < 0x1c83)
+ return C;
+ // 2 characters
+ if (C <= 0x1c84)
+ return C + -6210;
+ // CYRILLIC SMALL LETTER THREE-LEGGED TE
+ if (C == 0x1c85)
+ return 0x0442;
+ // CYRILLIC SMALL LETTER TALL HARD SIGN
+ if (C == 0x1c86)
+ return 0x044a;
+ // CYRILLIC SMALL LETTER TALL YAT
+ if (C == 0x1c87)
+ return 0x0463;
+ // CYRILLIC SMALL LETTER UNBLENDED UK
+ if (C == 0x1c88)
+ return 0xa64b;
+ if (C < 0x1e00)
+ return C;
+ // 75 characters
+ if (C <= 0x1e94)
+ return C | 1;
+ // LATIN SMALL LETTER LONG S WITH DOT ABOVE
+ if (C == 0x1e9b)
+ return 0x1e61;
+ // LATIN CAPITAL LETTER SHARP S
+ if (C == 0x1e9e)
+ return 0x00df;
+ if (C < 0x1ea0)
+ return C;
+ // 48 characters
+ if (C <= 0x1efe)
+ return C | 1;
+ if (C < 0x1f08)
+ return C;
+ // 8 characters
+ if (C <= 0x1f0f)
+ return C + -8;
+ if (C < 0x1f18)
+ return C;
+ // 6 characters
+ if (C <= 0x1f1d)
+ return C + -8;
+ if (C < 0x1f28)
+ return C;
+ // 8 characters
+ if (C <= 0x1f2f)
+ return C + -8;
+ if (C < 0x1f38)
+ return C;
+ // 8 characters
+ if (C <= 0x1f3f)
+ return C + -8;
+ if (C < 0x1f48)
+ return C;
+ // 6 characters
+ if (C <= 0x1f4d)
+ return C + -8;
+ if (C < 0x1f59)
+ return C;
+ // 4 characters
+ if (C <= 0x1f5f && C % 2 == 1)
+ return C + -8;
+ if (C < 0x1f68)
+ return C;
+ // 8 characters
+ if (C <= 0x1f6f)
+ return C + -8;
+ if (C < 0x1f88)
+ return C;
+ // 8 characters
+ if (C <= 0x1f8f)
+ return C + -8;
+ if (C < 0x1f98)
+ return C;
+ // 8 characters
+ if (C <= 0x1f9f)
+ return C + -8;
+ if (C < 0x1fa8)
+ return C;
+ // 8 characters
+ if (C <= 0x1faf)
+ return C + -8;
+ if (C < 0x1fb8)
+ return C;
+ // 2 characters
+ if (C <= 0x1fb9)
+ return C + -8;
+ if (C < 0x1fba)
+ return C;
+ // 2 characters
+ if (C <= 0x1fbb)
+ return C + -74;
+ // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+ if (C == 0x1fbc)
+ return 0x1fb3;
+ // GREEK PROSGEGRAMMENI
+ if (C == 0x1fbe)
+ return 0x03b9;
+ if (C < 0x1fc8)
+ return C;
+ // 4 characters
+ if (C <= 0x1fcb)
+ return C + -86;
+ // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+ if (C == 0x1fcc)
+ return 0x1fc3;
+ if (C < 0x1fd8)
+ return C;
+ // 2 characters
+ if (C <= 0x1fd9)
+ return C + -8;
+ if (C < 0x1fda)
+ return C;
+ // 2 characters
+ if (C <= 0x1fdb)
+ return C + -100;
+ if (C < 0x1fe8)
+ return C;
+ // 2 characters
+ if (C <= 0x1fe9)
+ return C + -8;
+ if (C < 0x1fea)
+ return C;
+ // 2 characters
+ if (C <= 0x1feb)
+ return C + -112;
+ // GREEK CAPITAL LETTER RHO WITH DASIA
+ if (C == 0x1fec)
+ return 0x1fe5;
+ if (C < 0x1ff8)
+ return C;
+ // 2 characters
+ if (C <= 0x1ff9)
+ return C + -128;
+ if (C < 0x1ffa)
+ return C;
+ // 2 characters
+ if (C <= 0x1ffb)
+ return C + -126;
+ // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+ if (C == 0x1ffc)
+ return 0x1ff3;
+ // OHM SIGN
+ if (C == 0x2126)
+ return 0x03c9;
+ // KELVIN SIGN
+ if (C == 0x212a)
+ return 0x006b;
+ // ANGSTROM SIGN
+ if (C == 0x212b)
+ return 0x00e5;
+ // TURNED CAPITAL F
+ if (C == 0x2132)
+ return 0x214e;
+ if (C < 0x2160)
+ return C;
+ // 16 characters
+ if (C <= 0x216f)
+ return C + 16;
+ // ROMAN NUMERAL REVERSED ONE HUNDRED
+ if (C == 0x2183)
+ return 0x2184;
+ if (C < 0x24b6)
+ return C;
+ // 26 characters
+ if (C <= 0x24cf)
+ return C + 26;
+ if (C < 0x2c00)
+ return C;
+ // 47 characters
+ if (C <= 0x2c2e)
+ return C + 48;
+ // LATIN CAPITAL LETTER L WITH DOUBLE BAR
+ if (C == 0x2c60)
+ return 0x2c61;
+ // LATIN CAPITAL LETTER L WITH MIDDLE TILDE
+ if (C == 0x2c62)
+ return 0x026b;
+ // LATIN CAPITAL LETTER P WITH STROKE
+ if (C == 0x2c63)
+ return 0x1d7d;
+ // LATIN CAPITAL LETTER R WITH TAIL
+ if (C == 0x2c64)
+ return 0x027d;
+ if (C < 0x2c67)
+ return C;
+ // 3 characters
+ if (C <= 0x2c6b && C % 2 == 1)
+ return C + 1;
+ // LATIN CAPITAL LETTER ALPHA
+ if (C == 0x2c6d)
+ return 0x0251;
+ // LATIN CAPITAL LETTER M WITH HOOK
+ if (C == 0x2c6e)
+ return 0x0271;
+ // LATIN CAPITAL LETTER TURNED A
+ if (C == 0x2c6f)
+ return 0x0250;
+ // LATIN CAPITAL LETTER TURNED ALPHA
+ if (C == 0x2c70)
+ return 0x0252;
+ if (C < 0x2c72)
+ return C;
+ // 2 characters
+ if (C <= 0x2c75 && C % 3 == 2)
+ return C + 1;
+ if (C < 0x2c7e)
+ return C;
+ // 2 characters
+ if (C <= 0x2c7f)
+ return C + -10815;
+ if (C < 0x2c80)
+ return C;
+ // 50 characters
+ if (C <= 0x2ce2)
+ return C | 1;
+ if (C < 0x2ceb)
+ return C;
+ // 2 characters
+ if (C <= 0x2ced && C % 2 == 1)
+ return C + 1;
+ if (C < 0x2cf2)
+ return C;
+ // 2 characters
+ if (C <= 0xa640 && C % 31054 == 11506)
+ return C + 1;
+ if (C < 0xa642)
+ return C;
+ // 22 characters
+ if (C <= 0xa66c)
+ return C | 1;
+ if (C < 0xa680)
+ return C;
+ // 14 characters
+ if (C <= 0xa69a)
+ return C | 1;
+ if (C < 0xa722)
+ return C;
+ // 7 characters
+ if (C <= 0xa72e)
+ return C | 1;
+ if (C < 0xa732)
+ return C;
+ // 31 characters
+ if (C <= 0xa76e)
+ return C | 1;
+ if (C < 0xa779)
+ return C;
+ // 2 characters
+ if (C <= 0xa77b && C % 2 == 1)
+ return C + 1;
+ // LATIN CAPITAL LETTER INSULAR G
+ if (C == 0xa77d)
+ return 0x1d79;
+ if (C < 0xa77e)
+ return C;
+ // 5 characters
+ if (C <= 0xa786)
+ return C | 1;
+ // LATIN CAPITAL LETTER SALTILLO
+ if (C == 0xa78b)
+ return 0xa78c;
+ // LATIN CAPITAL LETTER TURNED H
+ if (C == 0xa78d)
+ return 0x0265;
+ if (C < 0xa790)
+ return C;
+ // 2 characters
+ if (C <= 0xa792)
+ return C | 1;
+ if (C < 0xa796)
+ return C;
+ // 10 characters
+ if (C <= 0xa7a8)
+ return C | 1;
+ // LATIN CAPITAL LETTER H WITH HOOK
+ if (C == 0xa7aa)
+ return 0x0266;
+ // LATIN CAPITAL LETTER REVERSED OPEN E
+ if (C == 0xa7ab)
+ return 0x025c;
+ // LATIN CAPITAL LETTER SCRIPT G
+ if (C == 0xa7ac)
+ return 0x0261;
+ // LATIN CAPITAL LETTER L WITH BELT
+ if (C == 0xa7ad)
+ return 0x026c;
+ // LATIN CAPITAL LETTER SMALL CAPITAL I
+ if (C == 0xa7ae)
+ return 0x026a;
+ // LATIN CAPITAL LETTER TURNED K
+ if (C == 0xa7b0)
+ return 0x029e;
+ // LATIN CAPITAL LETTER TURNED T
+ if (C == 0xa7b1)
+ return 0x0287;
+ // LATIN CAPITAL LETTER J WITH CROSSED-TAIL
+ if (C == 0xa7b2)
+ return 0x029d;
+ // LATIN CAPITAL LETTER CHI
+ if (C == 0xa7b3)
+ return 0xab53;
+ if (C < 0xa7b4)
+ return C;
+ // 2 characters
+ if (C <= 0xa7b6)
+ return C | 1;
+ if (C < 0xab70)
+ return C;
+ // 80 characters
+ if (C <= 0xabbf)
+ return C + -38864;
+ if (C < 0xff21)
+ return C;
+ // 26 characters
+ if (C <= 0xff3a)
+ return C + 32;
+ if (C < 0x10400)
+ return C;
+ // 40 characters
+ if (C <= 0x10427)
+ return C + 40;
+ if (C < 0x104b0)
+ return C;
+ // 36 characters
+ if (C <= 0x104d3)
+ return C + 40;
+ if (C < 0x10c80)
+ return C;
+ // 51 characters
+ if (C <= 0x10cb2)
+ return C + 64;
+ if (C < 0x118a0)
+ return C;
+ // 32 characters
+ if (C <= 0x118bf)
+ return C + 32;
+ if (C < 0x1e900)
+ return C;
+ // 34 characters
+ if (C <= 0x1e921)
+ return C + 34;
+
+ return C;
+}
Modified: llvm/trunk/unittests/Support/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/CMakeLists.txt?rev=325107&r1=325106&r2=325107&view=diff
==============================================================================
--- llvm/trunk/unittests/Support/CMakeLists.txt (original)
+++ llvm/trunk/unittests/Support/CMakeLists.txt Wed Feb 14 02:05:09 2018
@@ -19,6 +19,7 @@ add_llvm_unittest(SupportTests
ConvertUTFTest.cpp
DataExtractorTest.cpp
DebugTest.cpp
+ DJBTest.cpp
EndianStreamTest.cpp
EndianTest.cpp
ErrnoTest.cpp
Added: llvm/trunk/unittests/Support/DJBTest.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/DJBTest.cpp?rev=325107&view=auto
==============================================================================
--- llvm/trunk/unittests/Support/DJBTest.cpp (added)
+++ llvm/trunk/unittests/Support/DJBTest.cpp Wed Feb 14 02:05:09 2018
@@ -0,0 +1,89 @@
+//===---------- llvm/unittest/Support/DJBTest.cpp -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DJB.h"
+#include "llvm/ADT/Twine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+TEST(DJBTest, caseFolding) {
+ struct TestCase {
+ StringLiteral One;
+ StringLiteral Two;
+ };
+
+ static constexpr TestCase Tests[] = {
+ {"ASDF", "asdf"},
+ {"qWeR", "QwEr"},
+ {"qqqqqqqqqqqqqqqqqqqq", "QQQQQQQQQQQQQQQQQQQQ"},
+
+ {"I", "i"},
+ // Latin Small Letter Dotless I
+ {u8"\u0130", "i"},
+ // Latin Capital Letter I With Dot Above
+ {u8"\u0131", "i"},
+
+ // Latin Capital Letter A With Grave
+ {u8"\u00c0", u8"\u00e0"},
+ // Latin Capital Letter A With Macron
+ {u8"\u0100", u8"\u0101"},
+ // Latin Capital Letter L With Acute
+ {u8"\u0139", u8"\u013a"},
+ // Cyrillic Capital Letter Ie
+ {u8"\u0415", u8"\u0435"},
+ // Latin Capital Letter A With Circumflex And Grave
+ {u8"\u1ea6", u8"\u1ea7"},
+ // Kelvin Sign
+ {u8"\u212a", u8"\u006b"},
+ // Glagolitic Capital Letter Chrivi
+ {u8"\u2c1d", u8"\u2c4d"},
+ // Fullwidth Latin Capital Letter M
+ {u8"\uff2d", u8"\uff4d"},
+ // Old Hungarian Capital Letter Ej
+ {u8"\U00010c92", u8"\U00010cd2"},
+ };
+
+ for (const TestCase &T : Tests) {
+ SCOPED_TRACE("Comparing '" + T.One + "' and '" + T.Two + "'");
+ EXPECT_EQ(caseFoldingDjbHash(T.One), caseFoldingDjbHash(T.Two));
+ }
+}
+
+TEST(DJBTest, knownValuesLowerCase) {
+ struct TestCase {
+ StringLiteral Text;
+ uint32_t Hash;
+ };
+ static constexpr TestCase Tests[] = {
+ {"", 5381u},
+ {"f", 177675u},
+ {"fo", 5863386u},
+ {"foo", 193491849u},
+ {"foob", 2090263819u},
+ {"fooba", 259229388u},
+ {"foobar", 4259602622u},
+ {"pneumonoultramicroscopicsilicovolcanoconiosis", 3999417781u},
+ };
+
+ for (const TestCase &T : Tests) {
+ SCOPED_TRACE("Text: '" + T.Text + "'");
+ EXPECT_EQ(T.Hash, djbHash(T.Text));
+ EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text));
+ EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text.upper()));
+ }
+}
+
+TEST(DJBTest, knownValuesUnicode) {
+ EXPECT_EQ(
+ 2326183139u,
+ caseFoldingDjbHash(
+ u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6"
+ u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2"));
+}
Added: llvm/trunk/utils/unicode-case-fold.py
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/utils/unicode-case-fold.py?rev=325107&view=auto
==============================================================================
--- llvm/trunk/utils/unicode-case-fold.py (added)
+++ llvm/trunk/utils/unicode-case-fold.py Wed Feb 14 02:05:09 2018
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+"""
+Unicode case folding database conversion utility
+
+Parses the database and generates a C++ function which implements the case
+folding algorithm. The database entries are of the form:
+
+ <code>; <status>; <mapping>; # <name>
+
+<status> can be one of four characters:
+ C - Common mappings
+ S - mappings for Simple case folding
+ F - mappings for Full case folding
+ T - special case for Turkish I characters
+
+Right now this generates a function which implements simple case folding (C+S
+entries).
+"""
+
+import sys
+import re
+import urllib2
+
+# This variable will body of the mappings function
+body = ""
+
+# Reads file line-by-line, extracts Common and Simple case fold mappings and
+# returns a (from_char, to_char, from_name) tuple.
+def mappings(f):
+ previous_from = -1
+ expr = re.compile(r'^(.*); [CS]; (.*); # (.*)')
+ for line in f:
+ m = expr.match(line)
+ if not m: continue
+ from_char = int(m.group(1), 16)
+ to_char = int(m.group(2), 16)
+ from_name = m.group(3)
+
+ if from_char <= previous_from:
+ raise Exception("Duplicate or unsorted characters in input")
+ yield from_char, to_char, from_name
+ previous_from = from_char
+
+# Computes the shift (to_char - from_char) in a mapping.
+def shift(mapping):
+ return mapping[1] - mapping[0]
+
+# Computes the stride (from_char2 - from_char1) of two mappings.
+def stride2(mapping1, mapping2):
+ return mapping2[0] - mapping1[0]
+
+# Computes the stride of a list of mappings. The list should have at least two
+# mappings. All mappings in the list are assumed to have the same stride.
+def stride(block):
+ return stride2(block[0], block[1])
+
+
+# b is a list of mappings. All the mappings are assumed to have the same
+# shift and the stride between adjecant mappings (if any) is constant.
+def dump_block(b):
+ global body
+
+ if len(b) == 1:
+ # Special case for handling blocks of length 1. We don't even need to
+ # emit the "if (C < X) return C" check below as all characters in this
+ # range will be caught by the "C < X" check emitted by the first
+ # non-trivial block.
+ body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
+ return
+
+ first = b[0][0]
+ last = first + stride(b) * (len(b)-1)
+ modulo = first % stride(b)
+
+ # All characters before this block map to themselves.
+ body += " if (C < {0:#06x})\n return C;\n".format(first)
+ body += " // {0} characters\n".format(len(b))
+
+ # Generic pattern: check upper bound (lower bound is checked by the "if"
+ # above) and modulo of C, return C+shift.
+ pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"
+
+ if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
+ # Special case:
+ # We can elide the modulo-check because the expression "C|1" will map
+ # the intervening characters to themselves.
+ pattern = " if (C <= {0:#06x})\n return C | 1;\n"
+ elif stride(b) == 1:
+ # Another special case: X % 1 is always zero, so don't emit the
+ # modulo-check.
+ pattern = " if (C <= {0:#06x})\n return C + {3};\n"
+
+ body += pattern.format(last, stride(b), modulo, shift(b[0]))
+
+current_block = []
+f = urllib2.urlopen(sys.argv[1])
+for m in mappings(f):
+ if len(current_block) == 0:
+ current_block.append(m)
+ continue
+
+ if shift(current_block[0]) != shift(m):
+ # Incompatible shift, start a new block.
+ dump_block(current_block)
+ current_block = [m]
+ continue
+
+ if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m):
+ current_block.append(m)
+ continue
+
+ # Incompatible stride, start a new block.
+ dump_block(current_block)
+ current_block = [m]
+f.close()
+
+dump_block(current_block)
+
+print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//'
+print '//'
+print '// This file was generated by utils/unicode-case-fold.py from the Unicode'
+print '// case folding database at'
+print '// ', sys.argv[1]
+print '//'
+print '// To regenerate this file, run:'
+print '// utils/unicode-case-fold.py \\'
+print '// "{}" \\'.format(sys.argv[1])
+print '// > lib/Support/UnicodeCaseFold.cpp'
+print '//'
+print '//===----------------------------------------------------------------------===//'
+print ''
+print '#include "llvm/Support/Unicode.h"'
+print ''
+print "int llvm::sys::unicode::foldCharSimple(int C) {"
+print body
+print " return C;"
+print "}"
Propchange: llvm/trunk/utils/unicode-case-fold.py
------------------------------------------------------------------------------
svn:executable = *
More information about the llvm-commits
mailing list