[libcxx-commits] [libcxxabi] c8c2b46 - [Demangle][Rust] Parse non-ASCII identifiers

Tomasz Miąsko via libcxx-commits libcxx-commits at lists.llvm.org
Fri Oct 1 13:14:03 PDT 2021


Author: Tomasz Miąsko
Date: 2021-10-01T22:08:32+02:00
New Revision: c8c2b4629f7597ac16102dab6150da14d68167de

URL: https://github.com/llvm/llvm-project/commit/c8c2b4629f7597ac16102dab6150da14d68167de
DIFF: https://github.com/llvm/llvm-project/commit/c8c2b4629f7597ac16102dab6150da14d68167de.diff

LOG: [Demangle][Rust] Parse non-ASCII identifiers

Rust allows use of non-ASCII identifiers, which in Rust mangling scheme
are encoded using Punycode.

The encoding deviates from the standard by using an underscore as the
separator between ASCII part and a base-36 encoding of non-ASCII
characters (avoiding hyphen-minus in the symbol name). Other than that,
the encoding follows the standard, and the decoder implemented here in
turn follows the one given in RFC 3492.

To avoid an extra intermediate memory allocation while decoding
Punycode, the interface of OutputStream is extended with an insert
method.

Reviewed By: dblaikie

Differential Revision: https://reviews.llvm.org/D104366

Added: 
    llvm/unittests/Demangle/OutputStreamTest.cpp

Modified: 
    libcxxabi/src/demangle/Utility.h
    llvm/include/llvm/Demangle/Utility.h
    llvm/lib/Demangle/RustDemangle.cpp
    llvm/test/Demangle/rust.test
    llvm/unittests/Demangle/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/libcxxabi/src/demangle/Utility.h b/libcxxabi/src/demangle/Utility.h
index 846a5f0818e7e..bb073ae073a04 100644
--- a/libcxxabi/src/demangle/Utility.h
+++ b/libcxxabi/src/demangle/Utility.h
@@ -126,6 +126,16 @@ class OutputStream {
     return this->operator<<(static_cast<unsigned long long>(N));
   }
 
+  void insert(size_t Pos, const char *S, size_t N) { // Insert S[0, N) at Pos.
+    assert(Pos <= CurrentPosition); // Pos may equal the end (plain append).
+    if (N == 0)
+      return;
+    grow(N); // Presumably ensures room for N more bytes -- confirm in grow().
+    std::memmove(Buffer + Pos + N, Buffer + Pos, CurrentPosition - Pos);
+    std::memcpy(Buffer + Pos, S, N); // Fill the gap opened by the memmove.
+    CurrentPosition += N;
+  }
+
   size_t getCurrentPosition() const { return CurrentPosition; }
   void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }
 

diff  --git a/llvm/include/llvm/Demangle/Utility.h b/llvm/include/llvm/Demangle/Utility.h
index 04ff65a35aed5..153033ccc5e9a 100644
--- a/llvm/include/llvm/Demangle/Utility.h
+++ b/llvm/include/llvm/Demangle/Utility.h
@@ -126,6 +126,16 @@ class OutputStream {
     return this->operator<<(static_cast<unsigned long long>(N));
   }
 
+  void insert(size_t Pos, const char *S, size_t N) { // Insert S[0, N) at Pos.
+    assert(Pos <= CurrentPosition); // Pos may equal the end (plain append).
+    if (N == 0)
+      return;
+    grow(N); // Presumably ensures room for N more bytes -- confirm in grow().
+    std::memmove(Buffer + Pos + N, Buffer + Pos, CurrentPosition - Pos);
+    std::memcpy(Buffer + Pos, S, N); // Fill the gap opened by the memmove.
+    CurrentPosition += N;
+  }
+
   size_t getCurrentPosition() const { return CurrentPosition; }
   void setCurrentPosition(size_t NewPos) { CurrentPosition = NewPos; }
 

diff  --git a/llvm/lib/Demangle/RustDemangle.cpp b/llvm/lib/Demangle/RustDemangle.cpp
index f916300835ce5..2f5a1b49e34d3 100644
--- a/llvm/lib/Demangle/RustDemangle.cpp
+++ b/llvm/lib/Demangle/RustDemangle.cpp
@@ -135,6 +135,7 @@ class Demangler {
   void printDecimalNumber(uint64_t N);
   void printBasicType(BasicType);
   void printLifetime(uint64_t Index);
+  void printIdentifier(Identifier Ident);
 
   char look() const;
   char consume();
@@ -283,8 +284,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
   switch (consume()) {
   case 'C': {
     parseOptionalBase62Number('s');
-    Identifier Ident = parseIdentifier();
-    print(Ident.Name);
+    printIdentifier(parseIdentifier());
     break;
   }
   case 'M': {
@@ -333,7 +333,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
         print(NS);
       if (!Ident.empty()) {
         print(":");
-        print(Ident.Name);
+        printIdentifier(Ident);
       }
       print('#');
       printDecimalNumber(Disambiguator);
@@ -342,7 +342,7 @@ bool Demangler::demanglePath(IsInType InType, LeaveGenericsOpen LeaveOpen) {
       // Implementation internal namespaces.
       if (!Ident.empty()) {
         print("::");
-        print(Ident.Name);
+        printIdentifier(Ident);
       }
     }
     break;
@@ -669,6 +669,8 @@ void Demangler::demangleFnSig() {
       print("C");
     } else {
       Identifier Ident = parseIdentifier();
+      if (Ident.Punycode)
+        Error = true;
       for (char C : Ident.Name) {
         // When mangling ABI string, the "-" is replaced with "_".
         if (C == '_')
@@ -1078,6 +1080,172 @@ void Demangler::printLifetime(uint64_t Index) {
   }
 }
 
+static inline bool decodePunycodeDigit(char C, size_t &Value) {
+  if (isLower(C)) { // Presumably 'a'..'z'; isLower is defined elsewhere.
+    Value = C - 'a'; // Letters map to digit values starting at 0.
+    return true;
+  }
+
+  if (isDigit(C)) {
+    Value = 26 + (C - '0'); // Decimal digits map to values starting at 26.
+    return true;
+  }
+
+  return false; // Not a base-36 digit; Value is left untouched.
+}
+
+static void removeNullBytes(OutputStream &Output, size_t StartIdx) {
+  char *Buffer = Output.getBuffer();
+  char *Start = Buffer + StartIdx; // Bytes before StartIdx are kept as-is.
+  char *End = Buffer + Output.getCurrentPosition(); // End of written data.
+  Output.setCurrentPosition(std::remove(Start, End, '\0') - Buffer);
+}
+
+// Encodes CodePoint as UTF-8 into Output (1 to 4 bytes; length not returned).
+// Returns false if CodePoint is a surrogate or is greater than 0x10FFFF.
+static inline bool encodeUTF8(size_t CodePoint, char *Output) {
+  if (0xD800 <= CodePoint && CodePoint <= 0xDFFF) // Surrogate range.
+    return false;
+
+  if (CodePoint <= 0x7F) { // One byte: plain ASCII.
+    Output[0] = CodePoint;
+    return true;
+  }
+
+  if (CodePoint <= 0x7FF) { // Two bytes: 110xxxxx 10xxxxxx.
+    Output[0] = 0xC0 | ((CodePoint >> 6) & 0x3F);
+    Output[1] = 0x80 | (CodePoint & 0x3F);
+    return true;
+  }
+
+  if (CodePoint <= 0xFFFF) { // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
+    Output[0] = 0xE0 | (CodePoint >> 12);
+    Output[1] = 0x80 | ((CodePoint >> 6) & 0x3F);
+    Output[2] = 0x80 | (CodePoint & 0x3F);
+    return true;
+  }
+
+  if (CodePoint <= 0x10FFFF) { // Four bytes: 11110xxx then three 10xxxxxx.
+    Output[0] = 0xF0 | (CodePoint >> 18);
+    Output[1] = 0x80 | ((CodePoint >> 12) & 0x3F);
+    Output[2] = 0x80 | ((CodePoint >> 6) & 0x3F);
+    Output[3] = 0x80 | (CodePoint & 0x3F);
+    return true;
+  }
+
+  return false; // Above the Unicode range.
+}
+
+// Decodes the Punycode-encoded Input and appends the UTF-8 result to Output.
+// Returns false on malformed input; Output may then hold partial data.
+static bool decodePunycode(StringView Input, OutputStream &Output) {
+  size_t OutputSize = Output.getCurrentPosition(); // Start of decoded text.
+  size_t InputIdx = 0;
+
+  // Rust uses an underscore as a delimiter.
+  size_t DelimiterPos = StringView::npos;
+  for (size_t I = 0; I != Input.size(); ++I)
+    if (Input[I] == '_')
+      DelimiterPos = I; // No break: the last underscore wins.
+
+  if (DelimiterPos != StringView::npos) {
+    // Copy basic code points before the last delimiter to the output.
+    for (; InputIdx != DelimiterPos; ++InputIdx) {
+      char C = Input[InputIdx];
+      if (!isValid(C))
+        return false;
+      // Code points are padded with zeros while decoding is in progress.
+      char UTF8[4] = {C}; // Remaining three bytes are zero-initialized.
+      Output += StringView(UTF8, UTF8 + 4);
+    }
+    // Skip over the delimiter.
+    ++InputIdx;
+  }
+
+  size_t Base = 36; // Parameter values as given in RFC 3492.
+  size_t Skew = 38;
+  size_t Bias = 72;
+  size_t N = 0x80; // Lowest code point above the ASCII range.
+  size_t TMin = 1;
+  size_t TMax = 26;
+  size_t Damp = 700;
+
+  auto Adapt = [&](size_t Delta, size_t NumPoints) { // Bias adaptation.
+    Delta /= Damp;
+    Delta += Delta / NumPoints;
+    Damp = 2; // Captured by reference: 700 is used for the first call only.
+
+    size_t K = 0;
+    while (Delta > (Base - TMin) * TMax / 2) {
+      Delta /= Base - TMin;
+      K += Base;
+    }
+    return K + (((Base - TMin + 1) * Delta) / (Delta + Skew));
+  };
+
+  // Main decoding loop.
+  for (size_t I = 0; InputIdx != Input.size(); I += 1) {
+    size_t OldI = I;
+    size_t W = 1; // Weight of the current digit.
+    size_t Max = std::numeric_limits<size_t>::max();
+    for (size_t K = Base; true; K += Base) { // One variable-length integer.
+      if (InputIdx == Input.size())
+        return false; // Input ended in the middle of a number.
+      char C = Input[InputIdx++];
+      size_t Digit = 0;
+      if (!decodePunycodeDigit(C, Digit))
+        return false;
+
+      if (Digit > (Max - I) / W) // I + Digit * W would overflow.
+        return false;
+      I += Digit * W;
+
+      size_t T; // Threshold for this digit, kept within [TMin, TMax].
+      if (K <= Bias)
+        T = TMin;
+      else if (K >= Bias + TMax)
+        T = TMax;
+      else
+        T = K - Bias;
+
+      if (Digit < T) // A digit below the threshold ends the number.
+        break;
+
+      if (W > Max / (Base - T)) // W * (Base - T) would overflow.
+        return false;
+      W *= (Base - T);
+    }
+    size_t NumPoints = (Output.getCurrentPosition() - OutputSize) / 4 + 1;
+    Bias = Adapt(I - OldI, NumPoints);
+
+    if (I / NumPoints > Max - N) // N + I / NumPoints would overflow.
+      return false;
+    N += I / NumPoints;
+    I = I % NumPoints;
+
+    // Insert N at position I in the output.
+    char UTF8[4] = {};
+    if (!encodeUTF8(N, UTF8))
+      return false;
+    Output.insert(OutputSize + I * 4, UTF8, 4); // Slots are 4 bytes wide.
+  }
+
+  removeNullBytes(Output, OutputSize); // Drop the zero padding.
+  return true;
+}
+
+void Demangler::printIdentifier(Identifier Ident) {
+  if (Error || !Print) // Emit nothing after an error or while Print is off.
+    return;
+
+  if (Ident.Punycode) {
+    if (!decodePunycode(Ident.Name, Output)) // Decodes directly into Output.
+      Error = true;
+  } else {
+    print(Ident.Name); // Non-Punycode names go through print() unchanged.
+  }
+}
+
 char Demangler::look() const {
   if (Error || Position >= Input.size())
     return 0;

diff  --git a/llvm/test/Demangle/rust.test b/llvm/test/Demangle/rust.test
index b5d3a160ff632..d926b68610126 100644
--- a/llvm/test/Demangle/rust.test
+++ b/llvm/test/Demangle/rust.test
@@ -237,6 +237,11 @@ CHECK: function::<extern "cdecl" fn()>
 CHECK: function::<unsafe extern "C-cmse-nonsecure-call" fn()>
        _RIC8functionFUK21C_cmse_nonsecure_callEuE
 
+; Invalid ABI with punycode.
+
+CHECK: _RIC8functionFKu3n3hEuE
+       _RIC8functionFKu3n3hEuE
+
 ; Trait objects
 
 CHECK: trait::<dyn >
@@ -456,6 +461,44 @@ CHECK: dot (.llvm.1234)
 CHECK: dot (.llvm.6789)
        _RC3dotC5crate.llvm.6789
 
+; Punycode
+
+CHECK: punycode::東京
+       _RNvC8punycodeu7_1lqs71d
+
+CHECK: punycode::zażółć_gęślą_jaźń
+       _RNvC8punycodeu29za_gl_ja_w3a7psa2tqtgb10airva
+
+CHECK: punycode::საჭმელად_გემრიელი_სადილი
+       _RNvC8punycodeu30____7hkackfecea1cbdathfdh9hlq6y
+
+CHECK: Gödel::Escher::Bach
+       _RNtNvCu8Gdel_5qa6Escher4Bach
+
+CHECK: punycode::🦁🐅
+       _RNvC8punycodeu7wn8hx1g
+
+; Punycode - invalid code point
+
+CHECK: _RCu5r731r
+       _RCu5r731r
+
+CHECK: _RCu8b44444yy
+       _RCu8b44444yy
+
+CHECK: _RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz
+       _RNvC1au25zzzzzzzzzzzzzzzzzzzzzzzzz
+
+; Punycode - early EOF
+
+CHECK: _RCu8_CCCAR_u4
+       _RCu8_CCCAR_u4
+
+; Punycode - overflow
+
+CHECK: _RNvC1au21p18888888888888888888
+       _RNvC1au21p18888888888888888888
+
 ; Invalid mangled characters
 
 CHECK: _RNvC2a.1c

diff  --git a/llvm/unittests/Demangle/CMakeLists.txt b/llvm/unittests/Demangle/CMakeLists.txt
index 4bcc9bb322ea9..a4baf91421e07 100644
--- a/llvm/unittests/Demangle/CMakeLists.txt
+++ b/llvm/unittests/Demangle/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_unittest(DemangleTests
   DemangleTest.cpp
   ItaniumDemangleTest.cpp
+  OutputStreamTest.cpp
   PartialDemangleTest.cpp
   RustDemangleTest.cpp
   StringViewTest.cpp

diff  --git a/llvm/unittests/Demangle/OutputStreamTest.cpp b/llvm/unittests/Demangle/OutputStreamTest.cpp
new file mode 100644
index 0000000000000..8c867969512e7
--- /dev/null
+++ b/llvm/unittests/Demangle/OutputStreamTest.cpp
@@ -0,0 +1,61 @@
+//===- llvm/unittest/OutputStreamTest.cpp - OutputStream unit tests -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Demangle/Utility.h"
+#include "gtest/gtest.h"
+#include <string>
+
+using namespace llvm;
+using llvm::itanium_demangle::OutputStream;
+
+static std::string toString(OutputStream &OS) {
+  return {OS.getBuffer(), OS.getCurrentPosition()}; // Copy of written bytes.
+}
+
+template <typename T> static std::string printToString(const T &Value) {
+  OutputStream OS;
+  OS << Value;
+  std::string s = toString(OS); // Copy out before the buffer is freed.
+  std::free(OS.getBuffer()); // The buffer is released by hand here.
+  return s;
+}
+
+TEST(OutputStreamTest, Format) {
+  EXPECT_EQ("0", printToString(0)); // Integers, including negative values.
+  EXPECT_EQ("1", printToString(1));
+  EXPECT_EQ("-1", printToString(-1));
+  EXPECT_EQ("-90", printToString(-90));
+  EXPECT_EQ("109", printToString(109));
+  EXPECT_EQ("400", printToString(400));
+
+  EXPECT_EQ("a", printToString('a')); // Single characters.
+  EXPECT_EQ("?", printToString('?'));
+
+  EXPECT_EQ("abc", printToString("abc")); // String literal.
+}
+
+TEST(OutputStreamTest, Insert) {
+  OutputStream OS;
+
+  OS.insert(0, "", 0); // Zero-length insert into an empty stream.
+  EXPECT_EQ("", toString(OS));
+
+  OS.insert(0, "abcd", 4); // First real insert.
+  EXPECT_EQ("abcd", toString(OS));
+
+  OS.insert(0, "x", 1); // Insert at the front.
+  EXPECT_EQ("xabcd", toString(OS));
+
+  OS.insert(5, "y", 1); // Insert at the current end.
+  EXPECT_EQ("xabcdy", toString(OS));
+
+  OS.insert(3, "defghi", 6); // Insert in the middle.
+  EXPECT_EQ("xabdefghicdy", toString(OS));
+
+  std::free(OS.getBuffer()); // The buffer is released by hand here.
+}


        


More information about the libcxx-commits mailing list