r204562 - MS ABI: Eliminate Duplicate Strings

David Majnemer david.majnemer at gmail.com
Sun Mar 23 10:47:16 PDT 2014


Author: majnemer
Date: Sun Mar 23 12:47:16 2014
New Revision: 204562

URL: http://llvm.org/viewvc/llvm-project?rev=204562&view=rev
Log:
MS ABI: Eliminate Duplicate Strings

COFF doesn't have mergeable sections so LLVM/clang's normal tactics for
string deduplication will not have any effect.

To remedy this we place each string inside it's own section and mark
the section as IMAGE_COMDAT_SELECT_ANY.  However, we can only do this if the
string has an external name that we can generate from it's contents.

To be compatible with MSVC, we must use their scheme.  Otherwise identical
strings in translation units from clang may not be deduplicated with
translation units in MSVC.

This fixes PR18248.

N.B. We will not attempt to do anything with a string literal which is not of
type 'char' or 'wchar_t' because their compiler does not support unicode
string literals as of this date.

Modified:
    cfe/trunk/include/clang/AST/Mangle.h
    cfe/trunk/lib/AST/ItaniumMangle.cpp
    cfe/trunk/lib/AST/MicrosoftMangle.cpp
    cfe/trunk/lib/CodeGen/CodeGenModule.cpp
    cfe/trunk/lib/CodeGen/CodeGenModule.h
    cfe/trunk/test/CodeGen/wchar-const.c

Modified: cfe/trunk/include/clang/AST/Mangle.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/Mangle.h?rev=204562&r1=204561&r2=204562&view=diff
==============================================================================
--- cfe/trunk/include/clang/AST/Mangle.h (original)
+++ cfe/trunk/include/clang/AST/Mangle.h Sun Mar 23 12:47:16 2014
@@ -31,9 +31,10 @@ namespace clang {
   class FunctionDecl;
   class NamedDecl;
   class ObjCMethodDecl;
-  class VarDecl;
+  class StringLiteral;
   struct ThisAdjustment;
   struct ThunkInfo;
+  class VarDecl;
 
 /// MangleBuffer - a convenient class for storing a name which is
 /// either the result of a mangling or is a constant string with
@@ -117,6 +118,7 @@ public:
 
   bool shouldMangleDeclName(const NamedDecl *D);
   virtual bool shouldMangleCXXName(const NamedDecl *D) = 0;
+  virtual bool shouldMangleStringLiteral(const StringLiteral *SL) = 0;
 
   // FIXME: consider replacing raw_ostream & with something like SmallString &.
   void mangleName(const NamedDecl *D, raw_ostream &);
@@ -135,6 +137,7 @@ public:
                              raw_ostream &) = 0;
   virtual void mangleCXXDtor(const CXXDestructorDecl *D, CXXDtorType Type,
                              raw_ostream &) = 0;
+  virtual void mangleStringLiteral(const StringLiteral *SL, raw_ostream &) = 0;
 
   void mangleGlobalBlock(const BlockDecl *BD,
                          const NamedDecl *ID,

Modified: cfe/trunk/lib/AST/ItaniumMangle.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/ItaniumMangle.cpp?rev=204562&r1=204561&r2=204562&view=diff
==============================================================================
--- cfe/trunk/lib/AST/ItaniumMangle.cpp (original)
+++ cfe/trunk/lib/AST/ItaniumMangle.cpp Sun Mar 23 12:47:16 2014
@@ -21,6 +21,7 @@
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/DeclTemplate.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/ExprObjC.h"
 #include "clang/AST/TypeLoc.h"
@@ -126,6 +127,9 @@ public:
   /// @{
 
   bool shouldMangleCXXName(const NamedDecl *D) override;
+  bool shouldMangleStringLiteral(const StringLiteral *) override {
+    return false;
+  }
   void mangleCXXName(const NamedDecl *D, raw_ostream &) override;
   void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk,
                    raw_ostream &) override;
@@ -153,6 +157,8 @@ public:
   void mangleItaniumThreadLocalWrapper(const VarDecl *D,
                                        raw_ostream &) override;
 
+  void mangleStringLiteral(const StringLiteral *, raw_ostream &) override;
+
   bool getNextDiscriminator(const NamedDecl *ND, unsigned &disc) {
     // Lambda closure types are already numbered.
     if (isLambda(ND))
@@ -3774,6 +3780,10 @@ void ItaniumMangleContextImpl::mangleTyp
   mangleCXXRTTIName(Ty, Out);
 }
 
+void ItaniumMangleContextImpl::mangleStringLiteral(const StringLiteral *, raw_ostream &) {
+  llvm_unreachable("Can't mangle string literals");
+}
+
 ItaniumMangleContext *
 ItaniumMangleContext::create(ASTContext &Context, DiagnosticsEngine &Diags) {
   return new ItaniumMangleContextImpl(Context, Diags);

Modified: cfe/trunk/lib/AST/MicrosoftMangle.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/MicrosoftMangle.cpp?rev=204562&r1=204561&r2=204562&view=diff
==============================================================================
--- cfe/trunk/lib/AST/MicrosoftMangle.cpp (original)
+++ cfe/trunk/lib/AST/MicrosoftMangle.cpp Sun Mar 23 12:47:16 2014
@@ -20,6 +20,7 @@
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
 #include "clang/AST/DeclTemplate.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/VTableBuilder.h"
 #include "clang/Basic/ABI.h"
@@ -27,6 +28,9 @@
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Support/MathExtras.h"
+
+#include <algorithm>
 
 using namespace clang;
 
@@ -93,6 +97,7 @@ public:
   MicrosoftMangleContextImpl(ASTContext &Context, DiagnosticsEngine &Diags)
       : MicrosoftMangleContext(Context, Diags) {}
   bool shouldMangleCXXName(const NamedDecl *D) override;
+  bool shouldMangleStringLiteral(const StringLiteral *SL) override;
   void mangleCXXName(const NamedDecl *D, raw_ostream &Out) override;
   void mangleVirtualMemPtrThunk(const CXXMethodDecl *MD, raw_ostream &) override;
   void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk,
@@ -118,6 +123,7 @@ public:
   void mangleDynamicInitializer(const VarDecl *D, raw_ostream &Out) override;
   void mangleDynamicAtExitDestructor(const VarDecl *D,
                                      raw_ostream &Out) override;
+  void mangleStringLiteral(const StringLiteral *SL, raw_ostream &Out) override;
   bool getNextDiscriminator(const NamedDecl *ND, unsigned &disc) {
     // Lambda closure types are already numbered.
     if (isLambda(ND))
@@ -321,6 +327,13 @@ bool MicrosoftMangleContextImpl::shouldM
   return true;
 }
 
+bool
+MicrosoftMangleContextImpl::shouldMangleStringLiteral(const StringLiteral *SL) {
+  return SL->isAscii() || SL->isWide();
+  // TODO: This needs to be updated when MSVC gains support for Unicode
+  // literals.
+}
+
 void MicrosoftCXXNameMangler::mangle(const NamedDecl *D,
                                      StringRef Prefix) {
   // MSVC doesn't mangle C++ names the same way it mangles extern "C" names.
@@ -2315,6 +2328,162 @@ MicrosoftMangleContextImpl::mangleDynami
   mangleInitFiniStub(D, Out, 'F');
 }
 
+void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral *SL,
+                                                     raw_ostream &Out) {
+  // <char-type> ::= 0   # char
+  //             ::= 1   # wchar_t
+  //             ::= ??? # char16_t/char32_t will need a mangling too...
+  //
+  // <literal-length> ::= <non-negative integer>  # the length of the literal
+  //
+  // <encoded-crc>    ::= <hex digit>+ @          # crc of the literal including
+  //                                              # null-terminator
+  //
+  // <encoded-string> ::= <simple character>           # uninteresting character
+  //                  ::= '?$' <hex digit> <hex digit> # these two nibbles
+  //                                                   # encode the byte for the
+  //                                                   # character
+  //                  ::= '?' [a-z]                    # \xe1 - \xfa
+  //                  ::= '?' [A-Z]                    # \xc1 - \xda
+  //                  ::= '?' [0-9]                    # [,/\:. \n\t'-]
+  //
+  // <literal> ::= '??_C at _' <char-type> <literal-length> <encoded-crc>
+  //               <encoded-string> '@'
+  MicrosoftCXXNameMangler Mangler(*this, Out);
+  Mangler.getStream() << "\01??_C at _";
+
+  // <char-type>: The "kind" of string literal is encoded into the mangled name.
+  // TODO: This needs to be updated when MSVC gains support for unicode
+  // literals.
+  if (SL->isAscii())
+    Mangler.getStream() << '0';
+  else if (SL->isWide())
+    Mangler.getStream() << '1';
+  else
+    llvm_unreachable("unexpected string literal kind!");
+
+  // <literal-length>: The next part of the mangled name consists of the length
+  // of the string.
+  // The StringLiteral does not consider the NUL terminator byte(s) but the
+  // mangling does.
+  // N.B. The length is in terms of bytes, not characters.
+  Mangler.mangleNumber(SL->getByteLength() + SL->getCharByteWidth());
+
+  // We will use the "Rocksoft^tm Model CRC Algorithm" to describe the
+  // properties of our CRC:
+  //   Width  : 32
+  //   Poly   : 04C11DB7
+  //   Init   : FFFFFFFF
+  //   RefIn  : True
+  //   RefOut : True
+  //   XorOut : 00000000
+  //   Check  : 340BC6D9
+  uint32_t CRC = 0xFFFFFFFFU;
+
+  auto UpdateCRC = [&CRC](char Byte) {
+    for (unsigned i = 0; i < 8; ++i) {
+      bool Bit = CRC & 0x80000000U;
+      if (Byte & (1U << i))
+        Bit = !Bit;
+      CRC <<= 1;
+      if (Bit)
+        CRC ^= 0x04C11DB7U;
+    }
+  };
+
+  // CRC all the bytes of the StringLiteral.
+  for (char Byte : SL->getBytes())
+    UpdateCRC(Byte);
+
+  // The NUL terminator byte(s) were not present earlier,
+  // we need to manually process those bytes into the CRC.
+  for (unsigned NullTerminator = 0; NullTerminator < SL->getCharByteWidth();
+       ++NullTerminator)
+    UpdateCRC('\x00');
+
+  // The literature refers to the process of reversing the bits in the final CRC
+  // output as "reflection".
+  CRC = llvm::reverseBits(CRC);
+
+  // <encoded-crc>: The CRC is encoded utilizing the standard number mangling
+  // scheme.
+  Mangler.mangleNumber(CRC);
+
+  // <encoded-crc>: The mangled name also contains the first 32 _characters_
+  // (including null-terminator bytes) of the StringLiteral.
+  // Each character is encoded by splitting them into bytes and then encoding
+  // the constituent bytes.
+  auto MangleCharacter = [&Mangler](char Byte) {
+    // There are five different manglings for characters:
+    // - ?[0-9]: The set of [,/\:. \n\t'-].
+    // - [a-zA-Z0-9_$]: A one-to-one mapping.
+    // - ?[a-z]: The range from \xe1 to \xfa.
+    // - ?[A-Z]: The range from \xc1 to \xda.
+    // - ?$XX: A fallback which maps nibbles.
+    static const char SpecialMap[] = {',', '/',  '\\', ':',  '.',
+                                      ' ', '\n', '\t', '\'', '-'};
+    const char *SpecialMapI =
+        std::find(std::begin(SpecialMap), std::end(SpecialMap), Byte);
+    if (SpecialMapI != std::end(SpecialMap)) {
+      Mangler.getStream() << '?' << SpecialMapI - SpecialMap;
+    } else if ((Byte >= 'a' && Byte <= 'z') || (Byte >= 'A' && Byte <= 'Z') ||
+               (Byte >= '0' && Byte <= '9') || Byte == '_' || Byte == '$') {
+      Mangler.getStream() << Byte;
+    } else if (Byte >= '\xe1' && Byte <= '\xfa') {
+      Mangler.getStream() << '?' << (char)('a' + Byte - '\xe1');
+    } else if (Byte >= '\xc1' && Byte <= '\xda') {
+      Mangler.getStream() << '?' << (char)('A' + Byte - '\xc1');
+    } else {
+      Mangler.getStream() << "?$";
+      Mangler.getStream() << (char)('A' + ((Byte >> 4) & 0xf));
+      Mangler.getStream() << (char)('A' + (Byte & 0xf));
+    }
+  };
+
+  // Enforce our 32 character max.
+  unsigned MaxBytes = 32 * SL->getCharByteWidth();
+  StringRef Bytes = SL->getBytes().substr(0, MaxBytes);
+  size_t BytesLength = Bytes.size();
+
+  if (SL->isAscii()) {
+    // A character maps directly to a byte for ASCII StringLiterals.
+    for (char Byte : Bytes)
+      MangleCharacter(Byte);
+  } else if (SL->isWide()) {
+    // The ordering of bytes in a wide StringLiteral is like so:
+    //   A B C D ...
+    // However, they are mangled in the following order:
+    //   B A D C ...
+    for (size_t i = 0; i != BytesLength;) {
+      char FirstByte = Bytes[i];
+      ++i;
+      if (i != BytesLength) {
+        char SecondByte = Bytes[i];
+        ++i;
+        MangleCharacter(SecondByte);
+      }
+      MangleCharacter(FirstByte);
+    }
+  } else {
+    llvm_unreachable("unexpected string literal kind!");
+    // TODO: This needs to be updated when MSVC gains support for Unicode
+    // literals.
+  }
+
+  // We should also encode the NUL terminator(s) if we encoded less than 32
+  // characters.
+  if (BytesLength < MaxBytes) {
+    size_t PaddingBytes = SL->getCharByteWidth();
+    size_t BytesLeft = MaxBytes - BytesLength;
+    if (BytesLeft < PaddingBytes)
+      PaddingBytes = BytesLeft;
+    for (unsigned i = 0; i < PaddingBytes; ++i)
+      MangleCharacter('\x00');
+  }
+
+  Mangler.getStream() << '@';
+}
+
 MicrosoftMangleContext *
 MicrosoftMangleContext::create(ASTContext &Context, DiagnosticsEngine &Diags) {
   return new MicrosoftMangleContextImpl(Context, Diags);

Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.cpp?rev=204562&r1=204561&r2=204562&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CodeGenModule.cpp (original)
+++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp Sun Mar 23 12:47:16 2014
@@ -2572,25 +2572,67 @@ CodeGenModule::GetConstantArrayFromStrin
 llvm::Constant *
 CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S) {
   CharUnits Align = getContext().getAlignOfGlobalVarInChars(S->getType());
-  if (S->isAscii() || S->isUTF8()) {
-    SmallString<64> Str(S->getString());
-    
-    // Resize the string to the right size, which is indicated by its type.
-    const ConstantArrayType *CAT = Context.getAsConstantArrayType(S->getType());
-    Str.resize(CAT->getSize().getZExtValue());
-    return GetAddrOfConstantString(Str, /*GlobalName*/ 0, Align.getQuantity());
+
+  llvm::StringMapEntry<llvm::GlobalVariable *> *Entry = nullptr;
+  llvm::GlobalVariable *GV = nullptr;
+  if (!LangOpts.WritableStrings) {
+    llvm::StringMap<llvm::GlobalVariable *> *ConstantStringMap = nullptr;
+    switch (S->getCharByteWidth()) {
+    case 1:
+      ConstantStringMap = &Constant1ByteStringMap;
+      break;
+    case 2:
+      ConstantStringMap = &Constant2ByteStringMap;
+      break;
+    case 4:
+      ConstantStringMap = &Constant4ByteStringMap;
+      break;
+    default:
+      llvm_unreachable("unhandled byte width!");
+    }
+    Entry = &ConstantStringMap->GetOrCreateValue(S->getBytes());
+    GV = Entry->getValue();
+  }
+
+  if (!GV) {
+    StringRef GlobalVariableName;
+    llvm::GlobalValue::LinkageTypes LT;
+    if (!LangOpts.WritableStrings &&
+        getCXXABI().getMangleContext().shouldMangleStringLiteral(S)) {
+      LT = llvm::GlobalValue::LinkOnceODRLinkage;
+
+      SmallString<256> Buffer;
+      llvm::raw_svector_ostream Out(Buffer);
+      getCXXABI().getMangleContext().mangleStringLiteral(S, Out);
+      Out.flush();
+
+      size_t Length = Buffer.size();
+      char *Name = MangledNamesAllocator.Allocate<char>(Length);
+      std::copy(Buffer.begin(), Buffer.end(), Name);
+      GlobalVariableName = StringRef(Name, Length);
+    } else {
+      LT = llvm::GlobalValue::PrivateLinkage;;
+      GlobalVariableName = ".str";
+    }
+
+    // OpenCL v1.2 s6.5.3: a string literal is in the constant address space.
+    unsigned AddrSpace = 0;
+    if (getLangOpts().OpenCL)
+      AddrSpace = getContext().getTargetAddressSpace(LangAS::opencl_constant);
+
+    llvm::Constant *C = GetConstantArrayFromStringLiteral(S);
+    GV = new llvm::GlobalVariable(
+        getModule(), C->getType(), !LangOpts.WritableStrings, LT, C,
+        GlobalVariableName, /*InsertBefore=*/nullptr,
+        llvm::GlobalVariable::NotThreadLocal, AddrSpace);
+    GV->setUnnamedAddr(true);
+    if (Entry)
+      Entry->setValue(GV);
   }
 
-  // FIXME: the following does not memoize wide strings.
-  llvm::Constant *C = GetConstantArrayFromStringLiteral(S);
-  llvm::GlobalVariable *GV =
-    new llvm::GlobalVariable(getModule(),C->getType(),
-                             !LangOpts.WritableStrings,
-                             llvm::GlobalValue::PrivateLinkage,
-                             C,".str");
+  if (Align.getQuantity() > GV->getAlignment())
+    GV->setAlignment(Align.getQuantity());
 
-  GV->setAlignment(Align.getQuantity());
-  GV->setUnnamedAddr(true);
   return GV;
 }
 
@@ -2615,7 +2657,7 @@ static llvm::GlobalVariable *GenerateStr
   llvm::Constant *C =
       llvm::ConstantDataArray::getString(CGM.getLLVMContext(), str, false);
 
-  // OpenCL v1.1 s6.5.3: a string literal is in the constant address space.
+  // OpenCL v1.2 s6.5.3: a string literal is in the constant address space.
   unsigned AddrSpace = 0;
   if (CGM.getLangOpts().OpenCL)
     AddrSpace = CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant);
@@ -2654,7 +2696,7 @@ llvm::Constant *CodeGenModule::GetAddrOf
     return GenerateStringLiteral(Str, false, *this, GlobalName, Alignment);
 
   llvm::StringMapEntry<llvm::GlobalVariable *> &Entry =
-    ConstantStringMap.GetOrCreateValue(Str);
+    Constant1ByteStringMap.GetOrCreateValue(Str);
 
   if (llvm::GlobalVariable *GV = Entry.getValue()) {
     if (Alignment > GV->getAlignment()) {

Modified: cfe/trunk/lib/CodeGen/CodeGenModule.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.h?rev=204562&r1=204561&r2=204562&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CodeGenModule.h (original)
+++ cfe/trunk/lib/CodeGen/CodeGenModule.h Sun Mar 23 12:47:16 2014
@@ -319,7 +319,10 @@ class CodeGenModule : public CodeGenType
   llvm::StringMap<llvm::Constant*> AnnotationStrings;
 
   llvm::StringMap<llvm::Constant*> CFConstantStringMap;
-  llvm::StringMap<llvm::GlobalVariable*> ConstantStringMap;
+
+  llvm::StringMap<llvm::GlobalVariable *> Constant1ByteStringMap;
+  llvm::StringMap<llvm::GlobalVariable *> Constant2ByteStringMap;
+  llvm::StringMap<llvm::GlobalVariable *> Constant4ByteStringMap;
   llvm::DenseMap<const Decl*, llvm::Constant *> StaticLocalDeclMap;
   llvm::DenseMap<const Decl*, llvm::GlobalVariable*> StaticLocalDeclGuardMap;
   llvm::DenseMap<const Expr*, llvm::Constant *> MaterializedGlobalTemporaryMap;

Modified: cfe/trunk/test/CodeGen/wchar-const.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/wchar-const.c?rev=204562&r1=204561&r2=204562&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/wchar-const.c (original)
+++ cfe/trunk/test/CodeGen/wchar-const.c Sun Mar 23 12:47:16 2014
@@ -15,7 +15,7 @@ typedef __WCHAR_TYPE__ wchar_t;
 
 
 // CHECK-DAR: private unnamed_addr constant [18 x i32] [i32 84,
-// CHECK-WIN: private unnamed_addr constant [18 x i16] [i16 84,
+// CHECK-WIN: linkonce_odr unnamed_addr constant [18 x i16] [i16 84,
 extern void foo(const wchar_t* p);
 int main (int argc, const char * argv[])
 {





More information about the cfe-commits mailing list