[llvm] r264551 - Reapply ~"Bitcode: Collect all MDString records into a single blob"

Duncan P. N. Exon Smith via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 27 16:17:55 PDT 2016


Author: dexonsmith
Date: Sun Mar 27 18:17:54 2016
New Revision: 264551

URL: http://llvm.org/viewvc/llvm-project?rev=264551&view=rev
Log:
Reapply ~"Bitcode: Collect all MDString records into a single blob"

Spiritually reapply commit r264409 (reverted in r264410), albeit with a
bit of a redesign.

Firstly, avoid splitting the big blob into multiple chunks of strings.

r264409 imposed an arbitrary limit to avoid a massive allocation on the
shared 'Record' SmallVector.  The bug with that commit only reproduced
when there were more than "chunk-size" strings.  A test for this would
have been useless long-term, since we're liable to adjust the chunk-size
in the future.

Thus, eliminate the motivation for chunk-ing by storing the string sizes
in the blob.  Here's the layout:

    vbr6: # of strings
    vbr6: offset-to-blob
    blob:
       [vbr6]: string lengths
       [char]: concatenated strings

Secondly, make the output of llvm-bcanalyzer readable.

I noticed when debugging r264409 that llvm-bcanalyzer was outputting a
massive blob all in one line.  Past a small number, the strings were
impossible to split in my head, and the lines were way too long.  This
version adds support in llvm-bcanalyzer for pretty-printing.

    <STRINGS abbrevid=4 op0=3 op1=9/> num-strings = 3 {
      'abc'
      'def'
      'ghi'
    }

>From the original commit:

Inspired by Mehdi's similar patch, http://reviews.llvm.org/D18342, this
should (a) slightly reduce bitcode size, since there is less record
overhead, and (b) greatly improve reading speed, since blobs are super
cheap to deserialize.

Added:
    llvm/trunk/test/Bitcode/metadata-strings.ll
Modified:
    llvm/trunk/include/llvm/Bitcode/LLVMBitCodes.h
    llvm/trunk/lib/Bitcode/Reader/BitcodeReader.cpp
    llvm/trunk/lib/Bitcode/Writer/BitcodeWriter.cpp
    llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.cpp
    llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.h
    llvm/trunk/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp

Modified: llvm/trunk/include/llvm/Bitcode/LLVMBitCodes.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Bitcode/LLVMBitCodes.h?rev=264551&r1=264550&r2=264551&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Bitcode/LLVMBitCodes.h (original)
+++ llvm/trunk/include/llvm/Bitcode/LLVMBitCodes.h Sun Mar 27 18:17:54 2016
@@ -209,7 +209,7 @@ enum { BITCODE_CURRENT_EPOCH = 0 };
   };
 
   enum MetadataCodes {
-    METADATA_STRING        = 1,   // MDSTRING:      [values]
+    METADATA_STRING_OLD    = 1,   // MDSTRING:      [values]
     METADATA_VALUE         = 2,   // VALUE:         [type num, value num]
     METADATA_NODE          = 3,   // NODE:          [n x md num]
     METADATA_NAME          = 4,   // STRING:        [values]
@@ -243,6 +243,7 @@ enum { BITCODE_CURRENT_EPOCH = 0 };
     METADATA_MODULE        = 32,  // [distinct, scope, name, ...]
     METADATA_MACRO         = 33,  // [distinct, macinfo, line, name, value]
     METADATA_MACRO_FILE    = 34,  // [distinct, macinfo, line, file, ...]
+    METADATA_STRINGS       = 35,  // [count, offset] blob([lengths][chars])
   };
 
   // The constants block (CONSTANTS_BLOCK_ID) describes emission for each

Modified: llvm/trunk/lib/Bitcode/Reader/BitcodeReader.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Bitcode/Reader/BitcodeReader.cpp?rev=264551&r1=264550&r2=264551&view=diff
==============================================================================
--- llvm/trunk/lib/Bitcode/Reader/BitcodeReader.cpp (original)
+++ llvm/trunk/lib/Bitcode/Reader/BitcodeReader.cpp Sun Mar 27 18:17:54 2016
@@ -396,6 +396,9 @@ private:
   std::error_code globalCleanup();
   std::error_code resolveGlobalAndAliasInits();
   std::error_code parseMetadata(bool ModuleLevel = false);
+  std::error_code parseMetadataStrings(ArrayRef<uint64_t> Record,
+                                       StringRef Blob,
+                                       unsigned &NextMetadataNo);
   std::error_code parseMetadataKinds();
   std::error_code parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record);
   std::error_code parseMetadataAttachment(Function &F);
@@ -1883,6 +1886,47 @@ BitcodeReader::parseMetadataKindRecord(S
 
 static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; }
 
+std::error_code BitcodeReader::parseMetadataStrings(ArrayRef<uint64_t> Record,
+                                                    StringRef Blob,
+                                                    unsigned &NextMetadataNo) {
+  // All the MDStrings in the block are emitted together in a single
+  // record.  The strings are concatenated and stored in a blob along with
+  // their sizes.
+  if (Record.size() != 2)
+    return error("Invalid record: metadata strings layout");
+
+  unsigned NumStrings = Record[0];
+  unsigned StringsOffset = Record[1];
+  if (!NumStrings)
+    return error("Invalid record: metadata strings with no strings");
+  if (StringsOffset >= Blob.size())
+    return error("Invalid record: metadata strings corrupt offset");
+
+  StringRef Lengths = Blob.slice(0, StringsOffset);
+  SimpleBitstreamCursor R(*StreamFile);
+  R.jumpToPointer(Lengths.begin());
+
+  // Ensure that Blob doesn't get invalidated, even if this is reading from
+  // a StreamingMemoryObject with corrupt data.
+  R.setArtificialByteLimit(R.getCurrentByteNo() + StringsOffset);
+
+  StringRef Strings = Blob.drop_front(StringsOffset);
+  do {
+    if (R.AtEndOfStream())
+      return error("Invalid record: metadata strings bad length");
+
+    unsigned Size = R.ReadVBR(6);
+    if (Strings.size() < Size)
+      return error("Invalid record: metadata strings truncated chars");
+
+    MetadataList.assignValue(MDString::get(Context, Strings.slice(0, Size)),
+                             NextMetadataNo++);
+    Strings = Strings.drop_front(Size);
+  } while (--NumStrings);
+
+  return std::error_code();
+}
+
 /// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing
 /// module level metadata.
 std::error_code BitcodeReader::parseMetadata(bool ModuleLevel) {
@@ -1929,7 +1973,8 @@ std::error_code BitcodeReader::parseMeta
 
     // Read a record.
     Record.clear();
-    unsigned Code = Stream.readRecord(Entry.ID, Record);
+    StringRef Blob;
+    unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob);
     bool IsDistinct = false;
     switch (Code) {
     default:  // Default behavior: ignore.
@@ -2363,7 +2408,7 @@ std::error_code BitcodeReader::parseMeta
           NextMetadataNo++);
       break;
     }
-    case bitc::METADATA_STRING: {
+    case bitc::METADATA_STRING_OLD: {
       std::string String(Record.begin(), Record.end());
 
       // Test for upgrading !llvm.loop.
@@ -2373,6 +2418,11 @@ std::error_code BitcodeReader::parseMeta
       MetadataList.assignValue(MD, NextMetadataNo++);
       break;
     }
+    case bitc::METADATA_STRINGS:
+      if (std::error_code EC =
+              parseMetadataStrings(Record, Blob, NextMetadataNo))
+        return EC;
+      break;
     case bitc::METADATA_KIND: {
       // Support older bitcode files that had METADATA_KIND records in a
       // block with METADATA_BLOCK_ID.

Modified: llvm/trunk/lib/Bitcode/Writer/BitcodeWriter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Bitcode/Writer/BitcodeWriter.cpp?rev=264551&r1=264550&r2=264551&view=diff
==============================================================================
--- llvm/trunk/lib/Bitcode/Writer/BitcodeWriter.cpp (original)
+++ llvm/trunk/lib/Bitcode/Writer/BitcodeWriter.cpp Sun Mar 27 18:17:54 2016
@@ -1347,31 +1347,65 @@ static void writeNamedMetadata(const Mod
   }
 }
 
+static unsigned createMetadataStringsAbbrev(BitstreamWriter &Stream) {
+  BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRINGS));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # of strings
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // offset to chars
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+  return Stream.EmitAbbrev(Abbv);
+}
+
+/// Write out a record for MDString.
+///
+/// All the metadata strings in a metadata block are emitted in a single
+/// record.  The sizes and strings themselves are shoved into a blob.
+static void writeMetadataStrings(ArrayRef<const Metadata *> Strings,
+                                 BitstreamWriter &Stream,
+                                 SmallVectorImpl<uint64_t> &Record) {
+  if (Strings.empty())
+    return;
+
+  // Start the record with the number of strings.
+  Record.push_back(bitc::METADATA_STRINGS);
+  Record.push_back(Strings.size());
+
+  // Emit the sizes of the strings in the blob.
+  SmallString<256> Blob;
+  {
+    BitstreamWriter W(Blob);
+    for (const Metadata *MD : Strings)
+      W.EmitVBR(cast<MDString>(MD)->getLength(), 6);
+    W.FlushToWord();
+  }
+
+  // Add the offset to the strings to the record.
+  Record.push_back(Blob.size());
+
+  // Add the strings to the blob.
+  for (const Metadata *MD : Strings)
+    Blob.append(cast<MDString>(MD)->getString());
+
+  // Emit the final record.
+  Stream.EmitRecordWithBlob(createMetadataStringsAbbrev(Stream), Record, Blob);
+  Record.clear();
+}
+
 static void WriteModuleMetadata(const Module &M,
                                 const ValueEnumerator &VE,
                                 BitstreamWriter &Stream) {
-  const auto &MDs = VE.getMDs();
-  if (MDs.empty() && M.named_metadata_empty())
+  if (VE.getMDs().empty() && M.named_metadata_empty())
     return;
 
   Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
 
-  unsigned MDSAbbrev = 0;
-  if (VE.hasMDString()) {
-    // Abbrev for METADATA_STRING.
-    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
-    Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRING));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
-    MDSAbbrev = Stream.EmitAbbrev(Abbv);
-  }
-
   // Initialize MDNode abbreviations.
 #define HANDLE_MDNODE_LEAF(CLASS) unsigned CLASS##Abbrev = 0;
 #include "llvm/IR/Metadata.def"
 
   SmallVector<uint64_t, 64> Record;
-  for (const Metadata *MD : MDs) {
+  writeMetadataStrings(VE.getMDStrings(), Stream, Record);
+  for (const Metadata *MD : VE.getNonMDStrings()) {
     if (const MDNode *N = dyn_cast<MDNode>(MD)) {
       assert(N->isResolved() && "Expected forward references to be resolved");
 
@@ -1385,17 +1419,7 @@ static void WriteModuleMetadata(const Mo
 #include "llvm/IR/Metadata.def"
       }
     }
-    if (const auto *MDC = dyn_cast<ConstantAsMetadata>(MD)) {
-      WriteValueAsMetadata(MDC, VE, Stream, Record);
-      continue;
-    }
-    const MDString *MDS = cast<MDString>(MD);
-    // Code: [strchar x N]
-    Record.append(MDS->bytes_begin(), MDS->bytes_end());
-
-    // Emit the finished record.
-    Stream.EmitRecord(bitc::METADATA_STRING, Record, MDSAbbrev);
-    Record.clear();
+    WriteValueAsMetadata(cast<ConstantAsMetadata>(MD), VE, Stream, Record);
   }
 
   writeNamedMetadata(M, VE, Stream, Record);

Modified: llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.cpp?rev=264551&r1=264550&r2=264551&view=diff
==============================================================================
--- llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.cpp (original)
+++ llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.cpp Sun Mar 27 18:17:54 2016
@@ -280,8 +280,7 @@ static bool isIntOrIntVectorValue(const
 
 ValueEnumerator::ValueEnumerator(const Module &M,
                                  bool ShouldPreserveUseListOrder)
-    : HasMDString(false),
-      ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
+    : ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
   if (ShouldPreserveUseListOrder)
     UseListOrders = predictUseListOrder(M);
 
@@ -375,6 +374,9 @@ ValueEnumerator::ValueEnumerator(const M
 
   // Optimize constant ordering.
   OptimizeConstants(FirstConstant, Values.size());
+
+  // Organize metadata ordering.
+  organizeMetadata();
 }
 
 unsigned ValueEnumerator::getInstructionID(const Instruction *Inst) const {
@@ -530,8 +532,8 @@ void ValueEnumerator::EnumerateMetadata(
     EnumerateMDNodeOperands(N);
   else if (auto *C = dyn_cast<ConstantAsMetadata>(MD))
     EnumerateValue(C->getValue());
-
-  HasMDString |= isa<MDString>(MD);
+  else
+    ++NumMDStrings;
 
   // Replace the dummy ID inserted above with the correct one.  MetadataMap may
   // have changed by inserting operands, so we need a fresh lookup here.
@@ -557,6 +559,19 @@ void ValueEnumerator::EnumerateFunctionL
   FunctionLocalMDs.push_back(Local);
 }
 
+void ValueEnumerator::organizeMetadata() {
+  if (!NumMDStrings)
+    return;
+
+  // Put the strings first.
+  std::stable_partition(MDs.begin(), MDs.end(),
+                        [](const Metadata *MD) { return isa<MDString>(MD); });
+
+  // Renumber.
+  for (unsigned I = 0, E = MDs.size(); I != E; ++I)
+    MetadataMap[MDs[I]] = I + 1;
+}
+
 void ValueEnumerator::EnumerateValue(const Value *V) {
   assert(!V->getType()->isVoidTy() && "Can't insert void values!");
   assert(!isa<MetadataAsValue>(V) && "EnumerateValue doesn't handle Metadata!");

Modified: llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.h?rev=264551&r1=264550&r2=264551&view=diff
==============================================================================
--- llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.h (original)
+++ llvm/trunk/lib/Bitcode/Writer/ValueEnumerator.h Sun Mar 27 18:17:54 2016
@@ -66,7 +66,7 @@ private:
   SmallVector<const LocalAsMetadata *, 8> FunctionLocalMDs;
   typedef DenseMap<const Metadata *, unsigned> MetadataMapType;
   MetadataMapType MetadataMap;
-  bool HasMDString;
+  unsigned NumMDStrings = 0;
   bool ShouldPreserveUseListOrder;
 
   typedef DenseMap<AttributeSet, unsigned> AttributeGroupMapType;
@@ -121,8 +121,6 @@ public:
   }
   unsigned numMDs() const { return MDs.size(); }
 
-  bool hasMDString() const { return HasMDString; }
-
   bool shouldPreserveUseListOrder() const { return ShouldPreserveUseListOrder; }
 
   unsigned getTypeID(Type *T) const {
@@ -157,9 +155,16 @@ public:
 
   const ValueList &getValues() const { return Values; }
   const std::vector<const Metadata *> &getMDs() const { return MDs; }
+  ArrayRef<const Metadata *> getMDStrings() const {
+    return makeArrayRef(MDs).slice(0, NumMDStrings);
+  }
+  ArrayRef<const Metadata *> getNonMDStrings() const {
+    return makeArrayRef(MDs).slice(NumMDStrings);
+  }
   const SmallVectorImpl<const LocalAsMetadata *> &getFunctionLocalMDs() const {
     return FunctionLocalMDs;
   }
+
   const TypeList &getTypes() const { return Types; }
   const std::vector<const BasicBlock*> &getBasicBlocks() const {
     return BasicBlocks;
@@ -189,6 +194,10 @@ public:
 private:
   void OptimizeConstants(unsigned CstStart, unsigned CstEnd);
 
+  // Reorder the reachable metadata.  This is not just an optimization, but is
+  // mandatory for emitting MDString correctly.
+  void organizeMetadata();
+
   void EnumerateMDNodeOperands(const MDNode *N);
   void EnumerateMetadata(const Metadata *MD);
   void EnumerateFunctionLocalMetadata(const LocalAsMetadata *Local);

Added: llvm/trunk/test/Bitcode/metadata-strings.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Bitcode/metadata-strings.ll?rev=264551&view=auto
==============================================================================
--- llvm/trunk/test/Bitcode/metadata-strings.ll (added)
+++ llvm/trunk/test/Bitcode/metadata-strings.ll Sun Mar 27 18:17:54 2016
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llvm-bcanalyzer -dump | FileCheck %s
+
+!named = !{!0}
+
+; CHECK:      <METADATA_BLOCK
+; CHECK-NEXT: <STRINGS
+; CHECK-SAME: /> num-strings = 3 {
+; CHECK-NEXT:   'a'
+; CHECK-NEXT:   'b'
+; CHECK-NEXT:   'c'
+; CHECK-NEXT: }
+!0 = !{!"a", !"b", !"c"}

Modified: llvm/trunk/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp?rev=264551&r1=264550&r2=264551&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp (original)
+++ llvm/trunk/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp Sun Mar 27 18:17:54 2016
@@ -312,7 +312,8 @@ static const char *GetCodeName(unsigned
   case bitc::METADATA_BLOCK_ID:
     switch(CodeID) {
     default:return nullptr;
-      STRINGIFY_CODE(METADATA, STRING)
+      STRINGIFY_CODE(METADATA, STRING_OLD)
+      STRINGIFY_CODE(METADATA, STRINGS)
       STRINGIFY_CODE(METADATA, NAME)
       STRINGIFY_CODE(METADATA, KIND) // Older bitcode has it in a MODULE_BLOCK
       STRINGIFY_CODE(METADATA, NODE)
@@ -404,6 +405,57 @@ static bool Error(const Twine &Err) {
   return true;
 }
 
+static bool decodeMetadataStringsBlob(BitstreamReader &Reader, StringRef Indent,
+                                      ArrayRef<uint64_t> Record,
+                                      StringRef Blob) {
+  if (Blob.empty())
+    return true;
+
+  if (Record.size() != 2)
+    return true;
+
+  unsigned NumStrings = Record[0];
+  unsigned StringsOffset = Record[1];
+  outs() << " num-strings = " << NumStrings << " {\n";
+
+  StringRef Lengths = Blob.slice(0, StringsOffset);
+  SimpleBitstreamCursor R(Reader);
+  R.jumpToPointer(Lengths.begin());
+
+  // Ensure that Blob doesn't get invalidated, even if this is reading from a
+  // StreamingMemoryObject with corrupt data.
+  R.setArtificialByteLimit(R.getCurrentByteNo() + StringsOffset);
+
+  StringRef Strings = Blob.drop_front(StringsOffset);
+  do {
+    if (R.AtEndOfStream())
+      return Error("bad length");
+
+    unsigned Size = R.ReadVBR(6);
+    if (Strings.size() < Size)
+      return Error("truncated chars");
+
+    outs() << Indent << "    '";
+    outs().write_escaped(Strings.slice(0, Size), /*hex=*/true);
+    outs() << "'\n";
+    Strings = Strings.drop_front(Size);
+  } while (--NumStrings);
+
+  outs() << Indent << "  }";
+  return false;
+}
+
+static bool decodeBlob(unsigned Code, unsigned BlockID, BitstreamReader &Reader,
+                       StringRef Indent, ArrayRef<uint64_t> Record,
+                       StringRef Blob) {
+  if (BlockID != bitc::METADATA_BLOCK_ID)
+    return true;
+  if (Code != bitc::METADATA_STRINGS)
+    return true;
+
+  return decodeMetadataStringsBlob(Reader, Indent, Record, Blob);
+}
+
 /// ParseBlock - Read a block, updating statistics, etc.
 static bool ParseBlock(BitstreamCursor &Stream, unsigned BlockID,
                        unsigned IndentLevel, CurStreamTypeType CurStreamType) {
@@ -557,7 +609,8 @@ static bool ParseBlock(BitstreamCursor &
         }
       }
 
-      if (Blob.data()) {
+      if (Blob.data() && decodeBlob(Code, BlockID, *Stream.getBitStreamReader(),
+                                    Indent, Record, Blob)) {
         outs() << " blob data = ";
         if (ShowBinaryBlobs) {
           outs() << "'";




More information about the llvm-commits mailing list