[lld] [lld-macho] Implement ObjC category merging (-objc_category_merging) (PR #82928)

Kyungwoo Lee via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 1 13:31:50 PST 2024


================
@@ -320,3 +332,915 @@ void objc::checkCategories() {
       }
   }
 }
+
+namespace {
+
+class ObjcCategoryMerger {
+  // Information about an input category
+  struct InfoInputCategory {
+    ConcatInputSection *catBodyIsec;
+    ConcatInputSection *catListIsec;
+    uint32_t offCatListIsec = 0;
+
+    bool wasMerged = false;
+  };
+
+  // To write new (merged) categories or classes, we will try make limited
+  // assumptions about the alignment and the sections the various class/category
+  // info are stored in and . So we'll just reuse the same sections and
+  // alignment as already used in existing (input) categories. To do this we
+  // have InfoCategoryWriter which contains the various sections that the
+  // generated categories will be written to.
+  template <typename T> struct InfroWriteSection {
+    bool valid = false; // Data has been successfully collected from input
+    uint32_t align = 0;
+    const Section *inputSection;
+    Reloc relocTemplate;
+    T *outputSection;
+  };
+
+  struct InfoCategoryWriter {
+    InfroWriteSection<ConcatOutputSection> catListInfo;
+    InfroWriteSection<ConcatOutputSection> catBodyInfo;
+    InfroWriteSection<CStringSection> catNameInfo;
+    InfroWriteSection<ConcatOutputSection> catPtrListInfo;
+  };
+
+  // Information about a pointer list in the original categories (method lists,
+  // protocol lists, etc)
+  struct PointerListInfo {
+    PointerListInfo(const char *pszSymNamePrefix)
+        : namePrefix(pszSymNamePrefix) {}
+    const char *namePrefix;
+
+    uint32_t structSize = 0;
+    uint32_t structCount = 0;
+
+    std::vector<Symbol *> allPtrs;
+  };
+
+  // Full information about all the categories that are extending a class. This
+  // will have all the additional methods, protocols, proprieties that are
+  // contained in all the categories that extend a particular class.
+  struct ClassExtensionInfo {
+    // Merged names of containers. Ex: base|firstCategory|secondCategory|...
+    std::string mergedContainerName;
+    std::string baseClassName;
+    Symbol *baseClass = nullptr;
+    // In case we generate new data, mark the new data as belonging to this file
+    ObjFile *objFileForMergeData = nullptr;
+
+    PointerListInfo instanceMethods =
+        objc::symbol_names::categoryInstanceMethods;
+    PointerListInfo classMethods = objc::symbol_names::categoryClassMethods;
+    PointerListInfo protocols = objc::symbol_names::categoryProtocols;
+    PointerListInfo instanceProps = objc::symbol_names::listProprieties;
+    PointerListInfo classProps = objc::symbol_names::klassPropList;
+  };
+
+public:
+  ObjcCategoryMerger(std::vector<ConcatInputSection *> &_allInputSections);
+  void doMerge();
+  static void doCleanup();
+
+private:
+  void collectAndValidateCategoriesData();
+  void
+  mergeCategoriesIntoSingleCategory(std::vector<InfoInputCategory> &categories);
+
+  void eraseISec(ConcatInputSection *isec);
+  void eraseMergedCategories();
+
+  void generateCatListForNonErasedCategories(
+      std::map<ConcatInputSection *, std::set<uint64_t>>
+          catListToErasedOffsets);
+  template <typename T>
+  void collectSectionWriteInfoFromIsec(InputSection *isec,
+                                       InfroWriteSection<T> &catWriteInfo);
+  void collectCategoryWriterInfoFromCategory(InfoInputCategory &catInfo);
+  void parseCatInfoToExtInfo(InfoInputCategory &catInfo,
+                             ClassExtensionInfo &extInfo);
+
+  void parseProtocolListInfo(ConcatInputSection *isec,
+                             uint32_t symbolsPerStruct,
+                             PointerListInfo &ptrList);
+
+  void parsePointerListInfo(ConcatInputSection *isec, uint32_t secOffset,
+                            uint32_t symbolsPerStruct,
+                            PointerListInfo &ptrList);
+
+  void emitAndLinkPointerList(Defined *parentSym, uint32_t linkAtOffset,
+                              ClassExtensionInfo &extInfo,
+                              PointerListInfo &ptrList);
+
+  void emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset,
+                               ClassExtensionInfo &extInfo,
+                               PointerListInfo &ptrList);
+
+  void emitCategory(ClassExtensionInfo &extInfo, Defined *&catBodySym);
+  void emitCatListEntrySec(std::string &forCateogryName,
+                           std::string &forBaseClassName, ObjFile *objFile,
+                           Defined *&catListSym);
+  void emitCategoryBody(std::string &name, Defined *nameSym,
+                        Symbol *baseClassSym, std::string &baseClassName,
+                        ObjFile *objFile, Defined *&catBodySym);
+  void emitCategoryName(std::string &name, ObjFile *objFile,
+                        Defined *&catNameSym);
+  void createSymbolReference(Defined *refFrom, Symbol *refTo, uint32_t offset,
+                             Reloc &relocTemplate);
+  bool tryGetSymbolAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                Symbol *&sym);
+  bool tryGetDefinedAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                 Defined *&defined);
+  void tryEraseDefinedAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                   bool stringOnly = false);
+
+  CategoryLayout catLayout;
+  ClassLayout classLayout;
+  ROClassLayout roClassLayout;
+  ListHeaderLayout listHeaderLayout;
+  MethodLayout methodLayout;
+  ProtocolListHeaderLayout protocolListHeaderLayout;
+
+  InfoCategoryWriter infoCategoryWriter;
+  std::vector<ConcatInputSection *> &allInputSections;
+  // Map of base class Symbol to list of InfoInputCategory's for it
+  std::map<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
+
+  // Normally, the binary data comes from the input files, but since we're
+  // generating binary data ourselves, we use the below arrays to store it in.
+  // Need this to be 'static' so the data survives past the ObjcCategoryMerger
+  // object, as the data will be read by the Writer when the final binary is
+  // generated.
+  static SmallVector<SmallString<0>> generatedNames;
+  static SmallVector<SmallVector<uint8_t>> generatedSectionData;
+};
+
+SmallVector<SmallString<0>> ObjcCategoryMerger::generatedNames;
+SmallVector<SmallVector<uint8_t>> ObjcCategoryMerger::generatedSectionData;
+
+ObjcCategoryMerger::ObjcCategoryMerger(
+    std::vector<ConcatInputSection *> &_allInputSections)
+    : catLayout(target->wordSize), classLayout(target->wordSize),
+      roClassLayout(target->wordSize), listHeaderLayout(target->wordSize),
+      methodLayout(target->wordSize),
+      protocolListHeaderLayout(target->wordSize),
+      allInputSections(_allInputSections) {}
+
+// This is a template so that it can be used both for CStringSection and
+// ConcatOutputSection
+template <typename T>
+void ObjcCategoryMerger::collectSectionWriteInfoFromIsec(
+    InputSection *isec, InfroWriteSection<T> &catWriteInfo) {
+
+  catWriteInfo.inputSection = &isec->section;
+  catWriteInfo.align = isec->align;
+  catWriteInfo.outputSection = dyn_cast_or_null<T>(isec->parent);
+
+  assert(catWriteInfo.outputSection &&
+         "outputSection may not be null in collectSectionWriteInfoFromIsec.");
+
+  if (isec->relocs.size())
+    catWriteInfo.relocTemplate = isec->relocs[0];
+
+  catWriteInfo.valid = true;
+}
+
+bool ObjcCategoryMerger::tryGetSymbolAtIsecOffset(ConcatInputSection *isec,
+                                                  uint32_t offset,
+                                                  Symbol *&sym) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return false;
+
+  sym = reloc->referent.get<Symbol *>();
+  return sym != nullptr;
+}
+
+bool ObjcCategoryMerger::tryGetDefinedAtIsecOffset(ConcatInputSection *isec,
+                                                   uint32_t offset,
+                                                   Defined *&defined) {
+  Symbol *sym;
+  if (!tryGetSymbolAtIsecOffset(isec, offset, sym))
+    return false;
+
+  defined = dyn_cast_or_null<Defined>(sym);
+  return defined != nullptr;
+}
+
+// Given an ConcatInputSection and an offset, if there is a symbol(Defined) at
+// that offset, then erase the symbol (mark it not live) from the final output.
+// Used for easely erasing already merged strings, method lists, etc ...
+void ObjcCategoryMerger::tryEraseDefinedAtIsecOffset(ConcatInputSection *isec,
+                                                     uint32_t offset,
+                                                     bool stringOnly) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return;
+
+  Defined *sym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  if (!sym)
+    return;
+
+  auto *cisec = dyn_cast_or_null<ConcatInputSection>(sym->isec);
+  if (!stringOnly && cisec) {
+    eraseISec(cisec);
+    return;
+  }
+
+  if (auto *cisec = dyn_cast_or_null<CStringInputSection>(sym->isec)) {
+    uint32_t totalOffset = sym->value + reloc->addend;
+    StringPiece &piece = cisec->getStringPiece(totalOffset);
+    piece.live = false;
+    return;
+  }
+}
+
+void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory(
+    InfoInputCategory &catInfo) {
+
+  collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+      catInfo.catListIsec, infoCategoryWriter.catListInfo);
+  collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+      catInfo.catBodyIsec, infoCategoryWriter.catBodyInfo);
+
+  if (!infoCategoryWriter.catNameInfo.valid) {
+    const Reloc *catNameReloc =
+        catInfo.catBodyIsec->getRelocAt(catLayout.nameOffset);
+
+    assert(catNameReloc && "Category does not have a reloc at nameOffset");
+
+    lld::macho::Defined *catDefSym =
+        dyn_cast_or_null<Defined>(catNameReloc->referent.dyn_cast<Symbol *>());
+    assert(catDefSym && "Reloc of category name is not a valid Defined symbol");
+
+    collectSectionWriteInfoFromIsec<CStringSection>(
+        catDefSym->isec, infoCategoryWriter.catNameInfo);
+  }
+
+  // Collect writer info from all the category lists (we're assuming they all
+  // would provide the same info)
+  if (!infoCategoryWriter.catPtrListInfo.valid) {
+    for (uint32_t off = catLayout.instanceMethodsOffset;
+         off <= catLayout.classPropsOffset; off += target->wordSize) {
+      Defined *ptrList;
+      if (tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, off, ptrList)) {
+        collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+            ptrList->isec, infoCategoryWriter.catPtrListInfo);
+        // we've successfully collected data, so we can break
+        break;
+      }
+    }
+  }
+}
+
+// Parse a protocol list that might be linked to at a ConcatInputSection given
+// offset. The format of the protocol list is different than other lists (prop
+// lists, method lists) so we need to parse it differently
+void ObjcCategoryMerger::parseProtocolListInfo(ConcatInputSection *isec,
+                                               uint32_t secOffset,
+                                               PointerListInfo &ptrList) {
+  if (!isec || (secOffset + target->wordSize > isec->data.size()))
+    assert("Tried to read pointer list beyond protocol section end");
+
+  const Reloc *reloc = isec->getRelocAt(secOffset);
+  if (!reloc)
+    return; // List is null, nothing to do
+
+  auto *ptrListSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  assert(ptrListSym && "Protocol list reloc does not have a valid Defined");
+
+  // Theoretically protocol count can be either 32b or 64b, but reading the
+  // first 32b is good enough
+  uint32_t protocolCount = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structSizeOffset);
+
+  ptrList.structCount += protocolCount;
+  ptrList.structSize = target->wordSize;
+
+  uint32_t expectedListSize =
+      (protocolCount * target->wordSize) +
+      /*header(count)*/ protocolListHeaderLayout.totalSize +
+      /*extra null value*/ target->wordSize;
+  assert(expectedListSize == ptrListSym->isec->data.size() &&
+         "Protocol list does not match expected size");
+
+  uint32_t off = protocolListHeaderLayout.totalSize;
+  for (uint32_t inx = 0; inx < protocolCount; inx++) {
+    const Reloc *reloc = ptrListSym->isec->getRelocAt(off);
+    assert(reloc && "No reloc found at protocol list offset");
+
+    auto *listSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+    assert(listSym && "Protocol list reloc does not have a valid Defined");
+
+    ptrList.allPtrs.push_back(listSym);
+    off += target->wordSize;
+  }
+}
+
+// Parse a pointer list that might be linked to at a ConcatInputSection given
+// offset. This can be used for instance methods, class methods, instance props
+// and class props since they have the same format.
+void ObjcCategoryMerger::parsePointerListInfo(ConcatInputSection *isec,
+                                              uint32_t secOffset,
+                                              uint32_t symbolsPerStruct,
+                                              PointerListInfo &ptrList) {
+  assert(symbolsPerStruct == 2 || symbolsPerStruct == 3);
+  assert(isec && "Trying to parse pointer list from null isec");
+  assert(secOffset + target->wordSize <= isec->data.size() &&
+         "Trying to read pointer list beyond section end");
+
+  const Reloc *reloc = isec->getRelocAt(secOffset);
+  if (!reloc)
+    return; // No reloc found, nothing to parse
+
+  auto *ptrListSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  assert(ptrListSym && "Reloc does not have a valid Defined");
+
+  uint32_t thisStructSize = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structSizeOffset);
+  uint32_t thisStructCount = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structCountOffset);
+
+  assert(!ptrList.structSize || (thisStructSize == ptrList.structSize));
+
+  ptrList.structCount += thisStructCount;
+  ptrList.structSize = thisStructSize;
+
+  uint32_t expectedListSize =
+      listHeaderLayout.totalSize + (thisStructSize * thisStructCount);
+
+  assert(expectedListSize == ptrListSym->isec->data.size() &&
+         "Pointer list does not match expected size");
+
+  for (uint32_t off = listHeaderLayout.totalSize; off < expectedListSize;
+       off += target->wordSize) {
+    const Reloc *reloc = ptrListSym->isec->getRelocAt(off);
+    assert(reloc && "No reloc found at pointer list offset");
+
+    auto *listSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+    assert(listSym && "Reloc does not have a valid Defined");
+
+    ptrList.allPtrs.push_back(listSym);
+  }
+}
+
+// Here we parse all the information of an input category (catInfo) and
+// append-store the parsed info into the strucutre which will contain all the
+// information about how a class is extended (extInfo)
+void ObjcCategoryMerger::parseCatInfoToExtInfo(InfoInputCategory &catInfo,
+                                               ClassExtensionInfo &extInfo) {
+  const Reloc *catNameReloc =
+      catInfo.catBodyIsec->getRelocAt(catLayout.nameOffset);
+
+  // Parse name
+  assert(catNameReloc && "Category does not have a reloc at 'nameOffset'");
+
+  if (!extInfo.mergedContainerName.empty())
+    extInfo.mergedContainerName += "|";
+
+  if (!extInfo.objFileForMergeData)
+    extInfo.objFileForMergeData =
+        dyn_cast_or_null<ObjFile>(catInfo.catBodyIsec->getFile());
+
+  StringRef catName = getReferentString(*catNameReloc);
+  extInfo.mergedContainerName += catName.str();
+
+  // Parse base class
+  const Reloc *klassReloc =
+      catInfo.catBodyIsec->getRelocAt(catLayout.klassOffset);
+
+  assert(klassReloc && "Category does not have a reloc at 'klassOffset'");
+
+  Symbol *classSym = klassReloc->referent.get<Symbol *>();
+
+  assert(
+      (!extInfo.baseClass || (extInfo.baseClass == classSym)) &&
+      "Trying to parse category info into container with different base class");
+
+  extInfo.baseClass = classSym;
+
+  if (extInfo.baseClassName.empty()) {
----------------
kyulee-com wrote:

In fact, you don't need to save baseClassName in separate (just waste of memory) as you can always derive it from `baseClass`.

https://github.com/llvm/llvm-project/pull/82928


More information about the llvm-commits mailing list