r204562 - MS ABI: Eliminate Duplicate Strings
Richard Smith
richard at metafoo.co.uk
Sun Mar 23 11:34:25 PDT 2014
On Sun, Mar 23, 2014 at 10:47 AM, David Majnemer
<david.majnemer at gmail.com> wrote:
> Author: majnemer
> Date: Sun Mar 23 12:47:16 2014
> New Revision: 204562
>
> URL: http://llvm.org/viewvc/llvm-project?rev=204562&view=rev
> Log:
> MS ABI: Eliminate Duplicate Strings
>
> COFF doesn't have mergeable sections so LLVM/clang's normal tactics for
> string deduplication will not have any effect.
>
> To remedy this we place each string inside its own section and mark
> the section as IMAGE_COMDAT_SELECT_ANY. However, we can only do this if
> the
> string has an external name that we can generate from its contents.
>
> To be compatible with MSVC, we must use their scheme. Otherwise identical
> strings in translation units from clang may not be deduplicated with
> translation units in MSVC.
>
> This fixes PR18248.
>
> N.B. We will not attempt to do anything with a string literal which is not
> of
> type 'char' or 'wchar_t' because their compiler does not support unicode
> string literals as of this date.
>
> Modified:
> cfe/trunk/include/clang/AST/Mangle.h
> cfe/trunk/lib/AST/ItaniumMangle.cpp
> cfe/trunk/lib/AST/MicrosoftMangle.cpp
> cfe/trunk/lib/CodeGen/CodeGenModule.cpp
> cfe/trunk/lib/CodeGen/CodeGenModule.h
> cfe/trunk/test/CodeGen/wchar-const.c
>
> Modified: cfe/trunk/include/clang/AST/Mangle.h
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/Mangle.h?rev=204562&r1=204561&r2=204562&view=diff
>
> ==============================================================================
> --- cfe/trunk/include/clang/AST/Mangle.h (original)
> +++ cfe/trunk/include/clang/AST/Mangle.h Sun Mar 23 12:47:16 2014
> @@ -31,9 +31,10 @@ namespace clang {
> class FunctionDecl;
> class NamedDecl;
> class ObjCMethodDecl;
> - class VarDecl;
> + class StringLiteral;
> struct ThisAdjustment;
> struct ThunkInfo;
> + class VarDecl;
>
> /// MangleBuffer - a convenient class for storing a name which is
> /// either the result of a mangling or is a constant string with
> @@ -117,6 +118,7 @@ public:
>
> bool shouldMangleDeclName(const NamedDecl *D);
> virtual bool shouldMangleCXXName(const NamedDecl *D) = 0;
> + virtual bool shouldMangleStringLiteral(const StringLiteral *SL) = 0;
>
> // FIXME: consider replacing raw_ostream & with something like
> SmallString &.
> void mangleName(const NamedDecl *D, raw_ostream &);
> @@ -135,6 +137,7 @@ public:
> raw_ostream &) = 0;
> virtual void mangleCXXDtor(const CXXDestructorDecl *D, CXXDtorType Type,
> raw_ostream &) = 0;
> + virtual void mangleStringLiteral(const StringLiteral *SL, raw_ostream
> &) = 0;
>
> void mangleGlobalBlock(const BlockDecl *BD,
> const NamedDecl *ID,
>
> Modified: cfe/trunk/lib/AST/ItaniumMangle.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/ItaniumMangle.cpp?rev=204562&r1=204561&r2=204562&view=diff
>
> ==============================================================================
> --- cfe/trunk/lib/AST/ItaniumMangle.cpp (original)
> +++ cfe/trunk/lib/AST/ItaniumMangle.cpp Sun Mar 23 12:47:16 2014
> @@ -21,6 +21,7 @@
> #include "clang/AST/DeclCXX.h"
> #include "clang/AST/DeclObjC.h"
> #include "clang/AST/DeclTemplate.h"
> +#include "clang/AST/Expr.h"
> #include "clang/AST/ExprCXX.h"
> #include "clang/AST/ExprObjC.h"
> #include "clang/AST/TypeLoc.h"
> @@ -126,6 +127,9 @@ public:
> /// @{
>
> bool shouldMangleCXXName(const NamedDecl *D) override;
> + bool shouldMangleStringLiteral(const StringLiteral *) override {
> + return false;
> + }
> void mangleCXXName(const NamedDecl *D, raw_ostream &) override;
> void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk,
> raw_ostream &) override;
> @@ -153,6 +157,8 @@ public:
> void mangleItaniumThreadLocalWrapper(const VarDecl *D,
> raw_ostream &) override;
>
> + void mangleStringLiteral(const StringLiteral *, raw_ostream &) override;
> +
> bool getNextDiscriminator(const NamedDecl *ND, unsigned &disc) {
> // Lambda closure types are already numbered.
> if (isLambda(ND))
> @@ -3774,6 +3780,10 @@ void ItaniumMangleContextImpl::mangleTyp
> mangleCXXRTTIName(Ty, Out);
> }
>
> +void ItaniumMangleContextImpl::mangleStringLiteral(const StringLiteral *,
> raw_ostream &) {
> + llvm_unreachable("Can't mangle string literals");
> +}
> +
> ItaniumMangleContext *
> ItaniumMangleContext::create(ASTContext &Context, DiagnosticsEngine
> &Diags) {
> return new ItaniumMangleContextImpl(Context, Diags);
>
> Modified: cfe/trunk/lib/AST/MicrosoftMangle.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/MicrosoftMangle.cpp?rev=204562&r1=204561&r2=204562&view=diff
>
> ==============================================================================
> --- cfe/trunk/lib/AST/MicrosoftMangle.cpp (original)
> +++ cfe/trunk/lib/AST/MicrosoftMangle.cpp Sun Mar 23 12:47:16 2014
> @@ -20,6 +20,7 @@
> #include "clang/AST/DeclCXX.h"
> #include "clang/AST/DeclObjC.h"
> #include "clang/AST/DeclTemplate.h"
> +#include "clang/AST/Expr.h"
> #include "clang/AST/ExprCXX.h"
> #include "clang/AST/VTableBuilder.h"
> #include "clang/Basic/ABI.h"
> @@ -27,6 +28,9 @@
> #include "clang/Basic/TargetInfo.h"
> #include "llvm/ADT/StringExtras.h"
> #include "llvm/ADT/StringMap.h"
> +#include "llvm/Support/MathExtras.h"
> +
> +#include <algorithm>
>
> using namespace clang;
>
> @@ -93,6 +97,7 @@ public:
> MicrosoftMangleContextImpl(ASTContext &Context, DiagnosticsEngine
> &Diags)
> : MicrosoftMangleContext(Context, Diags) {}
> bool shouldMangleCXXName(const NamedDecl *D) override;
> + bool shouldMangleStringLiteral(const StringLiteral *SL) override;
> void mangleCXXName(const NamedDecl *D, raw_ostream &Out) override;
> void mangleVirtualMemPtrThunk(const CXXMethodDecl *MD, raw_ostream &)
> override;
> void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk,
> @@ -118,6 +123,7 @@ public:
> void mangleDynamicInitializer(const VarDecl *D, raw_ostream &Out)
> override;
> void mangleDynamicAtExitDestructor(const VarDecl *D,
> raw_ostream &Out) override;
> + void mangleStringLiteral(const StringLiteral *SL, raw_ostream &Out)
> override;
> bool getNextDiscriminator(const NamedDecl *ND, unsigned &disc) {
> // Lambda closure types are already numbered.
> if (isLambda(ND))
> @@ -321,6 +327,13 @@ bool MicrosoftMangleContextImpl::shouldM
> return true;
> }
>
> +bool
> +MicrosoftMangleContextImpl::shouldMangleStringLiteral(const StringLiteral
> *SL) {
> + return SL->isAscii() || SL->isWide();
> + // TODO: This needs to be updated when MSVC gains support for Unicode
> + // literals.
> +}
> +
> void MicrosoftCXXNameMangler::mangle(const NamedDecl *D,
> StringRef Prefix) {
> // MSVC doesn't mangle C++ names the same way it mangles extern "C"
> names.
> @@ -2315,6 +2328,162 @@ MicrosoftMangleContextImpl::mangleDynami
> mangleInitFiniStub(D, Out, 'F');
> }
>
> +void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral
> *SL,
> + raw_ostream &Out) {
> + // <char-type> ::= 0 # char
> + // ::= 1 # wchar_t
> + // ::= ??? # char16_t/char32_t will need a mangling too...
> + //
> + // <literal-length> ::= <non-negative integer> # the length of the
> literal
> + //
> + // <encoded-crc> ::= <hex digit>+ @ # crc of the literal
> including
> + // # null-terminator
> + //
> + // <encoded-string> ::= <simple character> # uninteresting
> character
> + // ::= '?$' <hex digit> <hex digit> # these two nibbles
> + // # encode the byte
> for the
> + // # character
> + // ::= '?' [a-z] # \xe1 - \xfa
> + // ::= '?' [A-Z] # \xc1 - \xda
> + // ::= '?' [0-9] # [,/\:. \n\t'-]
> + //
> + // <literal> ::= '??_C at _' <char-type> <literal-length> <encoded-crc>
> + // <encoded-string> '@'
> + MicrosoftCXXNameMangler Mangler(*this, Out);
> + Mangler.getStream() << "\01??_C at _";
> +
> + // <char-type>: The "kind" of string literal is encoded into the
> mangled name.
> + // TODO: This needs to be updated when MSVC gains support for unicode
> + // literals.
> + if (SL->isAscii())
> + Mangler.getStream() << '0';
> + else if (SL->isWide())
> + Mangler.getStream() << '1';
> + else
> + llvm_unreachable("unexpected string literal kind!");
> +
> + // <literal-length>: The next part of the mangled name consists of the
> length
> + // of the string.
> + // The StringLiteral does not consider the NUL terminator byte(s) but
> the
> + // mangling does.
> + // N.B. The length is in terms of bytes, not characters.
> + Mangler.mangleNumber(SL->getByteLength() + SL->getCharByteWidth());
> +
> + // We will use the "Rocksoft^tm Model CRC Algorithm" to describe the
> + // properties of our CRC:
> + // Width : 32
> + // Poly : 04C11DB7
> + // Init : FFFFFFFF
> + // RefIn : True
> + // RefOut : True
> + // XorOut : 00000000
> + // Check : 340BC6D9
> + uint32_t CRC = 0xFFFFFFFFU;
> +
> + auto UpdateCRC = [&CRC](char Byte) {
> + for (unsigned i = 0; i < 8; ++i) {
> + bool Bit = CRC & 0x80000000U;
> + if (Byte & (1U << i))
> + Bit = !Bit;
> + CRC <<= 1;
> + if (Bit)
> + CRC ^= 0x04C11DB7U;
> + }
> + };
> +
> + // CRC all the bytes of the StringLiteral.
> + for (char Byte : SL->getBytes())
> + UpdateCRC(Byte);
> +
> + // The NUL terminator byte(s) were not present earlier,
> + // we need to manually process those bytes into the CRC.
> + for (unsigned NullTerminator = 0; NullTerminator <
> SL->getCharByteWidth();
> + ++NullTerminator)
> + UpdateCRC('\x00');
> +
> + // The literature refers to the process of reversing the bits in the
> final CRC
> + // output as "reflection".
> + CRC = llvm::reverseBits(CRC);
> +
> + // <encoded-crc>: The CRC is encoded utilizing the standard number
> mangling
> + // scheme.
> + Mangler.mangleNumber(CRC);
> +
> + // <encoded-string>: The mangled name also contains the first 32
> _characters_
> + // (including null-terminator bytes) of the StringLiteral.
> + // Each character is encoded by splitting them into bytes and then
> encoding
> + // the constituent bytes.
> + auto MangleCharacter = [&Mangler](char Byte) {
> + // There are five different manglings for characters:
> + // - ?[0-9]: The set of [,/\:. \n\t'-].
> + // - [a-zA-Z0-9_$]: A one-to-one mapping.
> + // - ?[a-z]: The range from \xe1 to \xfa.
> + // - ?[A-Z]: The range from \xc1 to \xda.
> + // - ?$XX: A fallback which maps nibbles.
> + static const char SpecialMap[] = {',', '/', '\\', ':', '.',
> + ' ', '\n', '\t', '\'', '-'};
> + const char *SpecialMapI =
> + std::find(std::begin(SpecialMap), std::end(SpecialMap), Byte);
>
Do we generate good code for this? Maybe move it down into the 'else' block
to avoid the linear scan in the common case?
> + if (SpecialMapI != std::end(SpecialMap)) {
> + Mangler.getStream() << '?' << SpecialMapI - SpecialMap;
> + } else if ((Byte >= 'a' && Byte <= 'z') || (Byte >= 'A' && Byte <=
> 'Z') ||
> + (Byte >= '0' && Byte <= '9') || Byte == '_' || Byte ==
> '$') {
> + Mangler.getStream() << Byte;
> + } else if (Byte >= '\xe1' && Byte <= '\xfa') {
> + Mangler.getStream() << '?' << (char)('a' + Byte - '\xe1');
> + } else if (Byte >= '\xc1' && Byte <= '\xda') {
> + Mangler.getStream() << '?' << (char)('A' + Byte - '\xc1');
> + } else {
> + Mangler.getStream() << "?$";
> + Mangler.getStream() << (char)('A' + ((Byte >> 4) & 0xf));
> + Mangler.getStream() << (char)('A' + (Byte & 0xf));
> + }
> + };
> +
> + // Enforce our 32 character max.
> + unsigned MaxBytes = 32 * SL->getCharByteWidth();
> + StringRef Bytes = SL->getBytes().substr(0, MaxBytes);
> + size_t BytesLength = Bytes.size();
> +
> + if (SL->isAscii()) {
> + // A character maps directly to a byte for ASCII StringLiterals.
> + for (char Byte : Bytes)
> + MangleCharacter(Byte);
> + } else if (SL->isWide()) {
> + // The ordering of bytes in a wide StringLiteral is like so:
> + // A B C D ...
> + // However, they are mangled in the following order:
> + // B A D C ...
>
I don't believe you =)
The ordering of bytes in a wide StringLiteral depends on the endianness of
the host machine. If the strings are mangled in big-endian order, we should
do that properly rather than assuming the host is little-endian.
> + for (size_t i = 0; i != BytesLength;) {
> + char FirstByte = Bytes[i];
> + ++i;
> + if (i != BytesLength) {
> + char SecondByte = Bytes[i];
> + ++i;
> + MangleCharacter(SecondByte);
> + }
> + MangleCharacter(FirstByte);
> + }
> + } else {
> + llvm_unreachable("unexpected string literal kind!");
> + // TODO: This needs to be updated when MSVC gains support for Unicode
> + // literals.
> + }
> +
> + // We should also encode the NUL terminator(s) if we encoded less than
> 32
> + // characters.
>
"less" should be "fewer" =)
> + if (BytesLength < MaxBytes) {
> + size_t PaddingBytes = SL->getCharByteWidth();
> + size_t BytesLeft = MaxBytes - BytesLength;
> + if (BytesLeft < PaddingBytes)
> + PaddingBytes = BytesLeft;
> + for (unsigned i = 0; i < PaddingBytes; ++i)
> + MangleCharacter('\x00');
> + }
> +
> + Mangler.getStream() << '@';
> +}
> +
> MicrosoftMangleContext *
> MicrosoftMangleContext::create(ASTContext &Context, DiagnosticsEngine
> &Diags) {
> return new MicrosoftMangleContextImpl(Context, Diags);
>
> Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.cpp?rev=204562&r1=204561&r2=204562&view=diff
>
> ==============================================================================
> --- cfe/trunk/lib/CodeGen/CodeGenModule.cpp (original)
> +++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp Sun Mar 23 12:47:16 2014
> @@ -2572,25 +2572,67 @@ CodeGenModule::GetConstantArrayFromStrin
> llvm::Constant *
> CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S)
> {
> CharUnits Align = getContext().getAlignOfGlobalVarInChars(S->getType());
> - if (S->isAscii() || S->isUTF8()) {
> - SmallString<64> Str(S->getString());
> -
> - // Resize the string to the right size, which is indicated by its
> type.
> - const ConstantArrayType *CAT =
> Context.getAsConstantArrayType(S->getType());
> - Str.resize(CAT->getSize().getZExtValue());
> - return GetAddrOfConstantString(Str, /*GlobalName*/ 0,
> Align.getQuantity());
> +
> + llvm::StringMapEntry<llvm::GlobalVariable *> *Entry = nullptr;
> + llvm::GlobalVariable *GV = nullptr;
> + if (!LangOpts.WritableStrings) {
> + llvm::StringMap<llvm::GlobalVariable *> *ConstantStringMap = nullptr;
> + switch (S->getCharByteWidth()) {
> + case 1:
> + ConstantStringMap = &Constant1ByteStringMap;
> + break;
> + case 2:
> + ConstantStringMap = &Constant2ByteStringMap;
> + break;
> + case 4:
> + ConstantStringMap = &Constant4ByteStringMap;
> + break;
> + default:
> + llvm_unreachable("unhandled byte width!");
> + }
> + Entry = &ConstantStringMap->GetOrCreateValue(S->getBytes());
> + GV = Entry->getValue();
> + }
> +
> + if (!GV) {
> + StringRef GlobalVariableName;
> + llvm::GlobalValue::LinkageTypes LT;
> + if (!LangOpts.WritableStrings &&
> + getCXXABI().getMangleContext().shouldMangleStringLiteral(S)) {
> + LT = llvm::GlobalValue::LinkOnceODRLinkage;
> +
> + SmallString<256> Buffer;
> + llvm::raw_svector_ostream Out(Buffer);
> + getCXXABI().getMangleContext().mangleStringLiteral(S, Out);
> + Out.flush();
> +
> + size_t Length = Buffer.size();
> + char *Name = MangledNamesAllocator.Allocate<char>(Length);
>
Do you really need an allocation and a copy here? Moving 'Buffer' outside
the 'if' would seem sufficient -- creating the GlobalVariable will make its
own copy.
> + std::copy(Buffer.begin(), Buffer.end(), Name);
> + GlobalVariableName = StringRef(Name, Length);
> + } else {
> + LT = llvm::GlobalValue::PrivateLinkage;;
> + GlobalVariableName = ".str";
> + }
> +
> + // OpenCL v1.2 s6.5.3: a string literal is in the constant address
> space.
> + unsigned AddrSpace = 0;
> + if (getLangOpts().OpenCL)
> + AddrSpace =
> getContext().getTargetAddressSpace(LangAS::opencl_constant);
> +
> + llvm::Constant *C = GetConstantArrayFromStringLiteral(S);
> + GV = new llvm::GlobalVariable(
> + getModule(), C->getType(), !LangOpts.WritableStrings, LT, C,
> + GlobalVariableName, /*InsertBefore=*/nullptr,
> + llvm::GlobalVariable::NotThreadLocal, AddrSpace);
> + GV->setUnnamedAddr(true);
> + if (Entry)
> + Entry->setValue(GV);
> }
>
> - // FIXME: the following does not memoize wide strings.
> - llvm::Constant *C = GetConstantArrayFromStringLiteral(S);
> - llvm::GlobalVariable *GV =
> - new llvm::GlobalVariable(getModule(),C->getType(),
> - !LangOpts.WritableStrings,
> - llvm::GlobalValue::PrivateLinkage,
> - C,".str");
> + if (Align.getQuantity() > GV->getAlignment())
> + GV->setAlignment(Align.getQuantity());
>
> - GV->setAlignment(Align.getQuantity());
> - GV->setUnnamedAddr(true);
> return GV;
> }
>
> @@ -2615,7 +2657,7 @@ static llvm::GlobalVariable *GenerateStr
> llvm::Constant *C =
> llvm::ConstantDataArray::getString(CGM.getLLVMContext(), str,
> false);
>
> - // OpenCL v1.1 s6.5.3: a string literal is in the constant address
> space.
> + // OpenCL v1.2 s6.5.3: a string literal is in the constant address
> space.
> unsigned AddrSpace = 0;
> if (CGM.getLangOpts().OpenCL)
> AddrSpace =
> CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant);
> @@ -2654,7 +2696,7 @@ llvm::Constant *CodeGenModule::GetAddrOf
> return GenerateStringLiteral(Str, false, *this, GlobalName,
> Alignment);
>
> llvm::StringMapEntry<llvm::GlobalVariable *> &Entry =
> - ConstantStringMap.GetOrCreateValue(Str);
> + Constant1ByteStringMap.GetOrCreateValue(Str);
>
> if (llvm::GlobalVariable *GV = Entry.getValue()) {
> if (Alignment > GV->getAlignment()) {
>
> Modified: cfe/trunk/lib/CodeGen/CodeGenModule.h
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.h?rev=204562&r1=204561&r2=204562&view=diff
>
> ==============================================================================
> --- cfe/trunk/lib/CodeGen/CodeGenModule.h (original)
> +++ cfe/trunk/lib/CodeGen/CodeGenModule.h Sun Mar 23 12:47:16 2014
> @@ -319,7 +319,10 @@ class CodeGenModule : public CodeGenType
> llvm::StringMap<llvm::Constant*> AnnotationStrings;
>
> llvm::StringMap<llvm::Constant*> CFConstantStringMap;
> - llvm::StringMap<llvm::GlobalVariable*> ConstantStringMap;
> +
> + llvm::StringMap<llvm::GlobalVariable *> Constant1ByteStringMap;
> + llvm::StringMap<llvm::GlobalVariable *> Constant2ByteStringMap;
> + llvm::StringMap<llvm::GlobalVariable *> Constant4ByteStringMap;
> llvm::DenseMap<const Decl*, llvm::Constant *> StaticLocalDeclMap;
> llvm::DenseMap<const Decl*, llvm::GlobalVariable*>
> StaticLocalDeclGuardMap;
> llvm::DenseMap<const Expr*, llvm::Constant *>
> MaterializedGlobalTemporaryMap;
>
> Modified: cfe/trunk/test/CodeGen/wchar-const.c
> URL:
> http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/wchar-const.c?rev=204562&r1=204561&r2=204562&view=diff
>
> ==============================================================================
> --- cfe/trunk/test/CodeGen/wchar-const.c (original)
> +++ cfe/trunk/test/CodeGen/wchar-const.c Sun Mar 23 12:47:16 2014
> @@ -15,7 +15,7 @@ typedef __WCHAR_TYPE__ wchar_t;
>
>
> // CHECK-DAR: private unnamed_addr constant [18 x i32] [i32 84,
> -// CHECK-WIN: private unnamed_addr constant [18 x i16] [i16 84,
> +// CHECK-WIN: linkonce_odr unnamed_addr constant [18 x i16] [i16 84,
> extern void foo(const wchar_t* p);
> int main (int argc, const char * argv[])
> {
You seem to be very short on tests; did you forget to svn add a file?
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20140323/c04c969c/attachment.html>
More information about the cfe-commits
mailing list