[Mlir-commits] [mlir] Add logging for emit functions in BytecodeWriter.cpp (PR #99558)

Thu Jul 18 13:22:03 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-mlir

Author: Kevin Gleason (GleasonK)

<details>
<summary>Changes</summary>

Recently there was a change to materializing unrealized conversion casts, which inserted conversion that previously did not exist during legalization (https://github.com/llvm/llvm-project/pull/97903), after these cases are inserted and then washed away after transformation completes, it caused the use-list ordering of an op to change in some cases: `my.add %arg0(use1), %arg0(use2) --> my.add %arg0(use2), %arg0(use1)`, which subtly changes the bytecode emitted since this is considered a custom use-list.

When investigating why the bytecode had changed I added the following logging which helped track down the difference, in my case it showed extra bytes with "use-list section". With `-debug-only=mlir-bytecode-writer` emits logs like the following, detailing the source of written bytes:

```
emitBytes(4b)	bytecode header
emitVarInt(6)	bytecode version
emitByte(13)	bytecode version
emitBytes(17b)	bytecode producer
emitByte(0)	null terminator
emitVarInt(2)	dialects count
...
emitByte(5)	dialect version
emitVarInt(4)	op names count
emitByte(9)	op names count
emitVarInt(0)	dialect number
...
emitVarInt(2)	dialect writer
emitByte(5)	dialect writer
emitVarInt(9259963783827161088)	dialect APInt
...
emitVarInt(3)	attr/type offset
emitByte(7)	attr/type offset
emitByte(3)	section code
emitVarInt(18)	section size
...
```

Note: this uses string constants and `StringLiteral`, I'm not sure if these are washed away during compilation / OK to have these around for debuggin, or if there's a better way to do this? Alternative was adding many braces and `LLVM_DEBUG` calls at each callsite, but this felt more error prone / likely to miss some callsites.

---

Patch is 31.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99558.diff


1 Files Affected:

- (modified) mlir/lib/Bytecode/Writer/BytecodeWriter.cpp (+153-113) 


``````````diff

diff --git a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
index 449d7549eb724..0e96aa97abeba 100644
--- a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
+++ b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/CachedHashString.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/raw_ostream.h"
 #include <optional>
@@ -145,7 +146,9 @@ class EncodingEmitter {
   //===--------------------------------------------------------------------===//
 
   /// Backpatch a byte in the result buffer at the given offset.
-  void patchByte(uint64_t offset, uint8_t value) {
+  void patchByte(uint64_t offset, uint8_t value, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs() << "patchByte(" << offset << ',' << uint64_t(value)
+                            << ")\t" << desc << '\n');
     assert(offset < size() && offset >= prevResultSize &&
            "cannot patch previously emitted data");
     currentResult[offset - prevResultSize] = value;
@@ -153,7 +156,9 @@ class EncodingEmitter {
 
   /// Emit the provided blob of data, which is owned by the caller and is
   /// guaranteed to not die before the end of the bytecode process.
-  void emitOwnedBlob(ArrayRef<uint8_t> data) {
+  void emitOwnedBlob(ArrayRef<uint8_t> data, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "emitOwnedBlob(" << data.size() << "b)\t" << desc << '\n');
     // Push the current buffer before adding the provided data.
     appendResult(std::move(currentResult));
     appendOwnedResult(data);
@@ -163,17 +168,19 @@ class EncodingEmitter {
   /// owned by the caller and is guaranteed to not die before the end of the
   /// bytecode process. The alignment value is also encoded, making it available
   /// on load.
-  void emitOwnedBlobAndAlignment(ArrayRef<uint8_t> data, uint32_t alignment) {
-    emitVarInt(alignment);
-    emitVarInt(data.size());
+  void emitOwnedBlobAndAlignment(ArrayRef<uint8_t> data, uint32_t alignment,
+                                 StringLiteral desc) {
+    emitVarInt(alignment, desc);
+    emitVarInt(data.size(), desc);
 
     alignTo(alignment);
-    emitOwnedBlob(data);
+    emitOwnedBlob(data, desc);
   }
-  void emitOwnedBlobAndAlignment(ArrayRef<char> data, uint32_t alignment) {
+  void emitOwnedBlobAndAlignment(ArrayRef<char> data, uint32_t alignment,
+                                 StringLiteral desc) {
     ArrayRef<uint8_t> castedData(reinterpret_cast<const uint8_t *>(data.data()),
                                  data.size());
-    emitOwnedBlobAndAlignment(castedData, alignment);
+    emitOwnedBlobAndAlignment(castedData, alignment, desc);
   }
 
   /// Align the emitter to the given alignment.
@@ -187,7 +194,7 @@ class EncodingEmitter {
     size_t curOffset = size();
     size_t paddingSize = llvm::alignTo(curOffset, alignment) - curOffset;
     while (paddingSize--)
-      emitByte(bytecode::kAlignmentByte);
+      emitByte(bytecode::kAlignmentByte, "alignment byte");
 
     // Keep track of the maximum required alignment.
     requiredAlignment = std::max(requiredAlignment, alignment);
@@ -198,12 +205,16 @@ class EncodingEmitter {
 
   /// Emit a single byte.
   template <typename T>
-  void emitByte(T byte) {
+  void emitByte(T byte, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "emitByte(" << uint64_t(byte) << ")\t" << desc << '\n');
     currentResult.push_back(static_cast<uint8_t>(byte));
   }
 
   /// Emit a range of bytes.
-  void emitBytes(ArrayRef<uint8_t> bytes) {
+  void emitBytes(ArrayRef<uint8_t> bytes, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "emitBytes(" << bytes.size() << "b)\t" << desc << '\n');
     llvm::append_range(currentResult, bytes);
   }
 
@@ -214,40 +225,43 @@ class EncodingEmitter {
   /// All remaining bits in the first byte, along with all of the bits in
   /// additional bytes, provide the value of the integer encoded in
   /// little-endian order.
-  void emitVarInt(uint64_t value) {
+  void emitVarInt(uint64_t value, StringLiteral desc) {
+    LLVM_DEBUG(llvm::dbgs() << "emitVarInt(" << value << ")\t" << desc << '\n');
+
     // In the most common case, the value can be represented in a single byte.
     // Given how hot this case is, explicitly handle that here.
     if ((value >> 7) == 0)
-      return emitByte((value << 1) | 0x1);
-    emitMultiByteVarInt(value);
+      return emitByte((value << 1) | 0x1, desc);
+    emitMultiByteVarInt(value, desc);
   }
 
   /// Emit a signed variable length integer. Signed varints are encoded using
   /// a varint with zigzag encoding, meaning that we use the low bit of the
   /// value to indicate the sign of the value. This allows for more efficient
   /// encoding of negative values by limiting the number of active bits
-  void emitSignedVarInt(uint64_t value) {
-    emitVarInt((value << 1) ^ (uint64_t)((int64_t)value >> 63));
+  void emitSignedVarInt(uint64_t value, StringLiteral desc) {
+    emitVarInt((value << 1) ^ (uint64_t)((int64_t)value >> 63), desc);
   }
 
   /// Emit a variable length integer whose low bit is used to encode the
   /// provided flag, i.e. encoded as: (value << 1) | (flag ? 1 : 0).
-  void emitVarIntWithFlag(uint64_t value, bool flag) {
-    emitVarInt((value << 1) | (flag ? 1 : 0));
+  void emitVarIntWithFlag(uint64_t value, bool flag, StringLiteral desc) {
+    emitVarInt((value << 1) | (flag ? 1 : 0), desc);
   }
 
   //===--------------------------------------------------------------------===//
   // String Emission
 
   /// Emit the given string as a nul terminated string.
-  void emitNulTerminatedString(StringRef str) {
-    emitString(str);
-    emitByte(0);
+  void emitNulTerminatedString(StringRef str, StringLiteral desc) {
+    emitString(str, desc);
+    emitByte(0, "null terminator");
   }
 
   /// Emit the given string without a nul terminator.
-  void emitString(StringRef str) {
-    emitBytes({reinterpret_cast<const uint8_t *>(str.data()), str.size()});
+  void emitString(StringRef str, StringLiteral desc) {
+    emitBytes({reinterpret_cast<const uint8_t *>(str.data()), str.size()},
+              desc);
   }
 
   //===--------------------------------------------------------------------===//
@@ -260,14 +274,14 @@ class EncodingEmitter {
     // indicate whether the section alignment is present, so save an offset to
     // it.
     uint64_t codeOffset = currentResult.size();
-    emitByte(code);
-    emitVarInt(emitter.size());
+    emitByte(code, "section code");
+    emitVarInt(emitter.size(), "section size");
 
     // Integrate the alignment of the section into this emitter if necessary.
     unsigned emitterAlign = emitter.requiredAlignment;
     if (emitterAlign > 1) {
       if (size() & (emitterAlign - 1)) {
-        emitVarInt(emitterAlign);
+        emitVarInt(emitterAlign, "section alignment");
         alignTo(emitterAlign);
 
         // Indicate that we needed to align the section, the high bit of the
@@ -295,7 +309,8 @@ class EncodingEmitter {
   /// fallback when the number of bytes needed to encode the value is greater
   /// than 1. We mark it noinline here so that the single byte hot path isn't
   /// pessimized.
-  LLVM_ATTRIBUTE_NOINLINE void emitMultiByteVarInt(uint64_t value);
+  LLVM_ATTRIBUTE_NOINLINE void emitMultiByteVarInt(uint64_t value,
+                                                   StringLiteral desc);
 
   /// Append a new result buffer to the current contents.
   void appendResult(std::vector<uint8_t> &&result) {
@@ -345,15 +360,15 @@ class StringSectionBuilder {
 
   /// Write the current set of strings to the given emitter.
   void write(EncodingEmitter &emitter) {
-    emitter.emitVarInt(strings.size());
+    emitter.emitVarInt(strings.size(), "string section size");
 
     // Emit the sizes in reverse order, so that we don't need to backpatch an
     // offset to the string data or have a separate section.
     for (const auto &it : llvm::reverse(strings))
-      emitter.emitVarInt(it.first.size() + 1);
+      emitter.emitVarInt(it.first.size() + 1, "string size");
     // Emit the string data itself.
     for (const auto &it : strings)
-      emitter.emitNulTerminatedString(it.first.val());
+      emitter.emitNulTerminatedString(it.first.val(), "string");
   }
 
 private:
@@ -380,32 +395,35 @@ class DialectWriter : public DialectBytecodeWriter {
   //===--------------------------------------------------------------------===//
 
   void writeAttribute(Attribute attr) override {
-    emitter.emitVarInt(numberingState.getNumber(attr));
+    emitter.emitVarInt(numberingState.getNumber(attr), "dialect attr");
   }
   void writeOptionalAttribute(Attribute attr) override {
     if (!attr) {
-      emitter.emitVarInt(0);
+      emitter.emitVarInt(0, "dialect optional attr none");
       return;
     }
-    emitter.emitVarIntWithFlag(numberingState.getNumber(attr), true);
+    emitter.emitVarIntWithFlag(numberingState.getNumber(attr), true,
+                               "dialect optional attr");
   }
 
   void writeType(Type type) override {
-    emitter.emitVarInt(numberingState.getNumber(type));
+    emitter.emitVarInt(numberingState.getNumber(type), "dialect type");
   }
 
   void writeResourceHandle(const AsmDialectResourceHandle &resource) override {
-    emitter.emitVarInt(numberingState.getNumber(resource));
+    emitter.emitVarInt(numberingState.getNumber(resource), "dialect resource");
   }
 
   //===--------------------------------------------------------------------===//
   // Primitives
   //===--------------------------------------------------------------------===//
 
-  void writeVarInt(uint64_t value) override { emitter.emitVarInt(value); }
+  void writeVarInt(uint64_t value) override {
+    emitter.emitVarInt(value, "dialect writer");
+  }
 
   void writeSignedVarInt(int64_t value) override {
-    emitter.emitSignedVarInt(value);
+    emitter.emitSignedVarInt(value, "dialect writer");
   }
 
   void writeAPIntWithKnownWidth(const APInt &value) override {
@@ -414,21 +432,21 @@ class DialectWriter : public DialectBytecodeWriter {
     // If the value is a single byte, just emit it directly without going
     // through a varint.
     if (bitWidth <= 8)
-      return emitter.emitByte(value.getLimitedValue());
+      return emitter.emitByte(value.getLimitedValue(), "dialect APInt");
 
     // If the value fits within a single varint, emit it directly.
     if (bitWidth <= 64)
-      return emitter.emitSignedVarInt(value.getLimitedValue());
+      return emitter.emitSignedVarInt(value.getLimitedValue(), "dialect APInt");
 
     // Otherwise, we need to encode a variable number of active words. We use
     // active words instead of the number of total words under the observation
     // that smaller values will be more common.
     unsigned numActiveWords = value.getActiveWords();
-    emitter.emitVarInt(numActiveWords);
+    emitter.emitVarInt(numActiveWords, "dialect APInt word count");
 
     const uint64_t *rawValueData = value.getRawData();
     for (unsigned i = 0; i < numActiveWords; ++i)
-      emitter.emitSignedVarInt(rawValueData[i]);
+      emitter.emitSignedVarInt(rawValueData[i], "dialect APInt word");
   }
 
   void writeAPFloatWithKnownSemantics(const APFloat &value) override {
@@ -436,16 +454,20 @@ class DialectWriter : public DialectBytecodeWriter {
   }
 
   void writeOwnedString(StringRef str) override {
-    emitter.emitVarInt(stringSection.insert(str));
+    emitter.emitVarInt(stringSection.insert(str), "dialect string");
   }
 
   void writeOwnedBlob(ArrayRef<char> blob) override {
-    emitter.emitVarInt(blob.size());
-    emitter.emitOwnedBlob(ArrayRef<uint8_t>(
-        reinterpret_cast<const uint8_t *>(blob.data()), blob.size()));
+    emitter.emitVarInt(blob.size(), "dialect blob");
+    emitter.emitOwnedBlob(
+        ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(blob.data()),
+                          blob.size()),
+        "dialect blob");
   }
 
-  void writeOwnedBool(bool value) override { emitter.emitByte(value); }
+  void writeOwnedBool(bool value) override {
+    emitter.emitByte(value, "dialect bool");
+  }
 
   int64_t getBytecodeVersion() const override { return bytecodeVersion; }
 
@@ -486,7 +508,7 @@ class PropertiesSectionBuilder {
       if (!prop)
         return std::nullopt;
       EncodingEmitter sizeEmitter;
-      sizeEmitter.emitVarInt(numberingState.getNumber(prop));
+      sizeEmitter.emitVarInt(numberingState.getNumber(prop), "properties size");
       scratch.clear();
       llvm::raw_svector_ostream os(scratch);
       sizeEmitter.writeTo(os);
@@ -507,16 +529,17 @@ class PropertiesSectionBuilder {
 
   /// Write the current set of properties to the given emitter.
   void write(EncodingEmitter &emitter) {
-    emitter.emitVarInt(propertiesStorage.size());
+    emitter.emitVarInt(propertiesStorage.size(), "properties size");
     if (propertiesStorage.empty())
       return;
     for (const auto &storage : propertiesStorage) {
       if (storage.empty()) {
-        emitter.emitBytes(ArrayRef<uint8_t>());
+        emitter.emitBytes(ArrayRef<uint8_t>(), "empty properties");
         continue;
       }
       emitter.emitBytes(ArrayRef(reinterpret_cast<const uint8_t *>(&storage[0]),
-                                 storage.size()));
+                                 storage.size()),
+                        "property");
     }
   }
 
@@ -532,7 +555,7 @@ class PropertiesSectionBuilder {
     SmallVector<char> sizeScratch;
     {
       EncodingEmitter sizeEmitter;
-      sizeEmitter.emitVarInt(rawProperties.size());
+      sizeEmitter.emitVarInt(rawProperties.size(), "properties");
       llvm::raw_svector_ostream os(sizeScratch);
       sizeEmitter.writeTo(os);
     }
@@ -576,7 +599,8 @@ class RawEmitterOstream : public raw_ostream {
 
 private:
   void write_impl(const char *ptr, size_t size) override {
-    emitter.emitBytes({reinterpret_cast<const uint8_t *>(ptr), size});
+    emitter.emitBytes({reinterpret_cast<const uint8_t *>(ptr), size},
+                      "raw emitter");
   }
   uint64_t current_pos() const override { return emitter.size(); }
 
@@ -591,7 +615,7 @@ void EncodingEmitter::writeTo(raw_ostream &os) const {
   os.write((const char *)currentResult.data(), currentResult.size());
 }
 
-void EncodingEmitter::emitMultiByteVarInt(uint64_t value) {
+void EncodingEmitter::emitMultiByteVarInt(uint64_t value, StringLiteral desc) {
   // Compute the number of bytes needed to encode the value. Each byte can hold
   // up to 7-bits of data. We only check up to the number of bits we can encode
   // in the first byte (8).
@@ -601,16 +625,16 @@ void EncodingEmitter::emitMultiByteVarInt(uint64_t value) {
       uint64_t encodedValue = (value << 1) | 0x1;
       encodedValue <<= (numBytes - 1);
       llvm::support::ulittle64_t encodedValueLE(encodedValue);
-      emitBytes({reinterpret_cast<uint8_t *>(&encodedValueLE), numBytes});
+      emitBytes({reinterpret_cast<uint8_t *>(&encodedValueLE), numBytes}, desc);
       return;
     }
   }
 
   // If the value is too large to encode in a single byte, emit a special all
   // zero marker byte and splat the value directly.
-  emitByte(0);
+  emitByte(0, desc);
   llvm::support::ulittle64_t valueLE(value);
-  emitBytes({reinterpret_cast<uint8_t *>(&valueLE), sizeof(valueLE)});
+  emitBytes({reinterpret_cast<uint8_t *>(&valueLE), sizeof(valueLE)}, desc);
 }
 
 //===----------------------------------------------------------------------===//
@@ -696,7 +720,7 @@ LogicalResult BytecodeWriter::write(Operation *rootOp, raw_ostream &os) {
 
   // Emit the bytecode file header. This is how we identify the output as a
   // bytecode file.
-  emitter.emitString("ML\xefR");
+  emitter.emitString("ML\xefR", "bytecode header");
 
   // Emit the bytecode version.
   if (config.bytecodeVersion < bytecode::kMinSupportedVersion ||
@@ -706,10 +730,10 @@ LogicalResult BytecodeWriter::write(Operation *rootOp, raw_ostream &os) {
            << ", must be in range ["
            << static_cast<int64_t>(bytecode::kMinSupportedVersion) << ", "
            << static_cast<int64_t>(bytecode::kVersion) << ']';
-  emitter.emitVarInt(config.bytecodeVersion);
+  emitter.emitVarInt(config.bytecodeVersion, "bytecode version");
 
   // Emit the producer.
-  emitter.emitNulTerminatedString(config.producer);
+  emitter.emitNulTerminatedString(config.producer, "bytecode producer");
 
   // Emit the dialect section.
   writeDialectSection(emitter);
@@ -760,8 +784,8 @@ static void writeDialectGrouping(EncodingEmitter &emitter, EntriesT &&entries,
     });
 
     // Emit the dialect and number of elements.
-    emitter.emitVarInt(currentDialect->number);
-    emitter.emitVarInt(std::distance(groupStart, it));
+    emitter.emitVarInt(currentDialect->number, "dialect number");
+    emitter.emitVarInt(std::distance(groupStart, it), "dialect offset");
 
     // Emit the entries within the group.
     for (auto &entry : llvm::make_range(groupStart, it))
@@ -774,13 +798,13 @@ void BytecodeWriter::writeDialectSection(EncodingEmitter &emitter) {
 
   // Emit the referenced dialects.
   auto dialects = numberingState.getDialects();
-  dialectEmitter.emitVarInt(llvm::size(dialects));
+  dialectEmitter.emitVarInt(llvm::size(dialects), "dialects count");
   for (DialectNumbering &dialect : dialects) {
     // Write the string section and get the ID.
     size_t nameID = stringSection.insert(dialect.name);
 
     if (config.bytecodeVersion < bytecode::kDialectVersioning) {
-      dialectEmitter.emitVarInt(nameID);
+      dialectEmitter.emitVarInt(nameID, "dialect name ID");
       continue;
     }
 
@@ -798,22 +822,25 @@ void BytecodeWriter::writeDialectSection(EncodingEmitter &emitter) {
     // this in the dialect ID, so if there is no version, we don't write the
     // section.
     size_t versionAvailable = versionEmitter.size() > 0;
-    dialectEmitter.emitVarIntWithFlag(nameID, versionAvailable);
+    dialectEmitter.emitVarIntWithFlag(nameID, versionAvailable,
+                                      "dialect version");
     if (versionAvailable)
       dialectEmitter.emitSection(bytecode::Section::kDialectVersions,
                                  std::move(versionEmitter));
   }
 
   if (config.bytecodeVersion >= bytecode::kElideUnknownBlockArgLocation)
-    dialectEmitter.emitVarInt(size(numberingState.getOpNames()));
+    dialectEmitter.emitVarInt(size(numberingState.getOpNames()),
+                              "op names count");
 
   // Emit the referenced operation names grouped by dialect.
   auto emitOpName = [&](OpNameNumbering &name) {
     size_t stringId = stringSection.insert(name.name.stripDialect());
     if (config.bytecodeVersion < bytecode::kNativePropertiesEncoding)
-      dialectEmitter.emitVarInt(stringId);
+      dialectEmitter.emitVarInt(stringId, "dialect op name");
     else
-      dialectEmitter.emitVarIntWithFlag(stringId, name.name.isRegistered());
+      dialectEmitter.emitVarIntWithFlag(stringId, name.name.isRegistered(),
+                                        "dialect op name");
   };
   writeDialectGrouping(dialectEmitter, numberingState.getOpNames(), emitOpName);
 
@@ -826,8 +853,10 @@ void BytecodeWriter::writeDialectSection(EncodingEmitter &emitter) {
 void BytecodeWriter::writeAttrTypeSection(EncodingEmitter &emitter) {
   EncodingEmitter attrTypeEmitter;
   EncodingEmitter offsetEmitter;
-  offsetEmitter.emitVarInt(llvm::size(numberingState.getAttributes()));
-  offsetEmitter.emitVarInt(llvm::size(numberingState.getTypes()));
+  offsetEmitter.emitVarInt(llvm::size(numberingState.getAttributes()),
+                           "attributes count");
+  offsetEmitter.emitVarInt(llvm::size(numberingState.getTypes()),
+                           "types count");
 
   // A functor used to emit an attribute or type entry.
   uint64_t prevOffset = 0;
@@ -836,7 +865,7 @@ void BytecodeWriter::writeAttrTypeSection(EncodingEmitter &emitter) {
 
     auto emitAttrOrTypeRawImpl = [&]() -> void {
       RawEmitterOstream(attrTypeEmitter) << entryValue;
-      attrTypeEmitter.emitByte(0);
+      attrTypeEmitter.emitByte(0, "attr/type separator");
     };
     auto emitAttrOrTypeImpl = [&]() -> bool {
       // TODO: We don't currently support custom encoded mutable types and
@@ -882,7 +911,8 @@ vo...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/99558