[lld] [llvm] [lld-macho] Implement ObjC category merging (-objc_category_merging) (PR #82928)

Kyungwoo Lee via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 8 00:52:01 PST 2024


================
@@ -320,3 +332,929 @@ void objc::checkCategories() {
       }
   }
 }
+
+namespace {
+
+class ObjcCategoryMerger {
+  // Information about an input category
+  struct InfoInputCategory {
+    ConcatInputSection *catListIsec;
+    ConcatInputSection *catBodyIsec;
+    uint32_t offCatListIsec = 0;
+
+    bool wasMerged = false;
+  };
+
+  // To write new (merged) categories or classes, we will try make limited
+  // assumptions about the alignment and the sections the various class/category
+  // info are stored in and . So we'll just reuse the same sections and
+  // alignment as already used in existing (input) categories. To do this we
+  // have InfoCategoryWriter which contains the various sections that the
+  // generated categories will be written to.
+  template <typename T> struct InfroWriteSection {
+    bool valid = false; // Data has been successfully collected from input
+    uint32_t align = 0;
+    Section *inputSection;
+    Reloc relocTemplate;
+    T *outputSection;
+  };
+
+  struct InfoCategoryWriter {
+    InfroWriteSection<ConcatOutputSection> catListInfo;
+    InfroWriteSection<ConcatOutputSection> catBodyInfo;
+    InfroWriteSection<CStringSection> catNameInfo;
+    InfroWriteSection<ConcatOutputSection> catPtrListInfo;
+  };
+
+  // Information about a pointer list in the original categories (method lists,
+  // protocol lists, etc)
+  struct PointerListInfo {
+    PointerListInfo(const char *_categoryPrefix, uint32_t _categoryOffset,
+                    uint32_t _pointersPerStruct)
+        : categoryPrefix(_categoryPrefix), categoryOffset(_categoryOffset),
+          pointersPerStruct(_pointersPerStruct) {}
+    const char *categoryPrefix;
+    uint32_t categoryOffset = 0;
+
+    uint32_t pointersPerStruct = 0;
+
+    uint32_t structSize = 0;
+    uint32_t structCount = 0;
+
+    std::vector<Symbol *> allPtrs;
+  };
+
+  // Full information about all the categories that are extending a class. This
+  // will have all the additional methods, protocols, proprieties that are
+  // contained in all the categories that extend a particular class.
+  struct ClassExtensionInfo {
+    ClassExtensionInfo(CategoryLayout &_catLayout) : catLayout(_catLayout){};
+
+    // Merged names of containers. Ex: base|firstCategory|secondCategory|...
+    std::string mergedContainerName;
+    std::string baseClassName;
+    Symbol *baseClass = nullptr;
+    CategoryLayout &catLayout;
+
+    // In case we generate new data, mark the new data as belonging to this file
+    ObjFile *objFileForMergeData = nullptr;
+
+    PointerListInfo instanceMethods = {
+        objc::symbol_names::categoryInstanceMethods,
+        /*_categoryOffset=*/catLayout.instanceMethodsOffset,
+        /*pointersPerStruct=*/3};
+    PointerListInfo classMethods = {
+        objc::symbol_names::categoryClassMethods,
+        /*_categoryOffset=*/catLayout.classMethodsOffset,
+        /*pointersPerStruct=*/3};
+    PointerListInfo protocols = {objc::symbol_names::categoryProtocols,
+                                 /*_categoryOffset=*/catLayout.protocolsOffset,
+                                 /*pointersPerStruct=*/0};
+    PointerListInfo instanceProps = {
+        objc::symbol_names::listProprieties,
+        /*_categoryOffset=*/catLayout.instancePropsOffset,
+        /*pointersPerStruct=*/2};
+    PointerListInfo classProps = {
+        objc::symbol_names::klassPropList,
+        /*_categoryOffset=*/catLayout.classPropsOffset,
+        /*pointersPerStruct=*/2};
+  };
+
+public:
+  ObjcCategoryMerger(std::vector<ConcatInputSection *> &_allInputSections);
+  void doMerge();
+  static void doCleanup();
+
+private:
+  void collectAndValidateCategoriesData();
+  void
+  mergeCategoriesIntoSingleCategory(std::vector<InfoInputCategory> &categories);
+
+  void eraseISec(ConcatInputSection *isec);
+  void eraseMergedCategories();
+
+  void generateCatListForNonErasedCategories(
+      std::map<ConcatInputSection *, std::set<uint64_t>>
+          catListToErasedOffsets);
+  template <typename T>
+  void collectSectionWriteInfoFromIsec(const InputSection *isec,
+                                       InfroWriteSection<T> &catWriteInfo);
+  void collectCategoryWriterInfoFromCategory(const InfoInputCategory &catInfo);
+  void parseCatInfoToExtInfo(const InfoInputCategory &catInfo,
+                             ClassExtensionInfo &extInfo);
+
+  void parseProtocolListInfo(const ConcatInputSection *isec, uint32_t secOffset,
+                             PointerListInfo &ptrList);
+
+  void parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset,
+                            PointerListInfo &ptrList);
+
+  void emitAndLinkPointerList(Defined *parentSym, uint32_t linkAtOffset,
+                              const ClassExtensionInfo &extInfo,
+                              const PointerListInfo &ptrList);
+
+  void emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset,
+                               const ClassExtensionInfo &extInfo,
+                               const PointerListInfo &ptrList);
+
+  Defined *emitCategory(const ClassExtensionInfo &extInfo);
+  Defined *emitCatListEntrySec(const std::string &forCateogryName,
+                               const std::string &forBaseClassName,
+                               ObjFile *objFile);
+  Defined *emitCategoryBody(const std::string &name, const Defined *nameSym,
+                            const Symbol *baseClassSym,
+                            const std::string &baseClassName, ObjFile *objFile);
+  Defined *emitCategoryName(const std::string &name, ObjFile *objFile);
+  void createSymbolReference(Defined *refFrom, const Symbol *refTo,
+                             uint32_t offset, const Reloc &relocTemplate);
+  Symbol *tryGetSymbolAtIsecOffset(const ConcatInputSection *isec,
+                                   uint32_t offset);
+  Defined *tryGetDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                     uint32_t offset);
+  void tryEraseDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                   uint32_t offset, bool stringOnly = false);
+
+  CategoryLayout catLayout;
+  ClassLayout classLayout;
+  ROClassLayout roClassLayout;
+  ListHeaderLayout listHeaderLayout;
+  MethodLayout methodLayout;
+  ProtocolListHeaderLayout protocolListHeaderLayout;
+
+  InfoCategoryWriter infoCategoryWriter;
+  std::vector<ConcatInputSection *> &allInputSections;
+  // Map of base class Symbol to list of InfoInputCategory's for it
+  std::map<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
+
+  // Normally, the binary data comes from the input files, but since we're
+  // generating binary data ourselves, we use the below arrays to store it in.
+  // Need this to be 'static' so the data survives past the ObjcCategoryMerger
+  // object, as the data will be read by the Writer when the final binary is
+  // generated.
+  static SmallVector<SmallString<0>> generatedNames;
+  static SmallVector<SmallVector<uint8_t>> generatedSectionData;
+};
+
+SmallVector<SmallString<0>> ObjcCategoryMerger::generatedNames;
+SmallVector<SmallVector<uint8_t>> ObjcCategoryMerger::generatedSectionData;
+
+ObjcCategoryMerger::ObjcCategoryMerger(
+    std::vector<ConcatInputSection *> &_allInputSections)
+    : catLayout(target->wordSize), classLayout(target->wordSize),
+      roClassLayout(target->wordSize), listHeaderLayout(target->wordSize),
+      methodLayout(target->wordSize),
+      protocolListHeaderLayout(target->wordSize),
+      allInputSections(_allInputSections) {}
+
+// This is a template so that it can be used both for CStringSection and
+// ConcatOutputSection
+template <typename T>
+void ObjcCategoryMerger::collectSectionWriteInfoFromIsec(
+    const InputSection *isec, InfroWriteSection<T> &catWriteInfo) {
+
+  catWriteInfo.inputSection = const_cast<Section *>(&isec->section);
+  catWriteInfo.align = isec->align;
+  catWriteInfo.outputSection = dyn_cast_or_null<T>(isec->parent);
+
+  assert(catWriteInfo.outputSection &&
+         "outputSection may not be null in collectSectionWriteInfoFromIsec.");
+
+  if (isec->relocs.size())
+    catWriteInfo.relocTemplate = isec->relocs[0];
+
+  catWriteInfo.valid = true;
+}
+
+Symbol *
+ObjcCategoryMerger::tryGetSymbolAtIsecOffset(const ConcatInputSection *isec,
+                                             uint32_t offset) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return nullptr;
+
+  return reloc->referent.get<Symbol *>();
+}
+
+Defined *
+ObjcCategoryMerger::tryGetDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                              uint32_t offset) {
+  Symbol *sym = tryGetSymbolAtIsecOffset(isec, offset);
+  return dyn_cast_or_null<Defined>(sym);
+}
+
+// Given an ConcatInputSection and an offset, if there is a symbol(Defined) at
+// that offset, then erase the symbol (mark it not live) from the final output.
+// Used for easely erasing already merged strings, method lists, etc ...
+void ObjcCategoryMerger::tryEraseDefinedAtIsecOffset(
+    const ConcatInputSection *isec, uint32_t offset, bool stringOnly) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return;
+
+  Defined *sym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  if (!sym)
+    return;
+
+  auto *cisec = dyn_cast_or_null<ConcatInputSection>(sym->isec);
+  if (!stringOnly && cisec) {
+    eraseISec(cisec);
+    return;
+  }
+
+  if (auto *cisec = dyn_cast_or_null<CStringInputSection>(sym->isec)) {
+    uint32_t totalOffset = sym->value + reloc->addend;
+    StringPiece &piece = cisec->getStringPiece(totalOffset);
+    piece.live = false;
+    return;
+  }
+}
+
+void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory(
+    const InfoInputCategory &catInfo) {
+
+  if (!infoCategoryWriter.catListInfo.valid)
+    collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+        catInfo.catListIsec, infoCategoryWriter.catListInfo);
+  if (!infoCategoryWriter.catBodyInfo.valid)
+    collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+        catInfo.catBodyIsec, infoCategoryWriter.catBodyInfo);
+
+  if (!infoCategoryWriter.catNameInfo.valid) {
+    lld::macho::Defined *catNameSym =
+        tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset);
+    assert(catNameSym && "Category does not have a valid name Symbol");
+
+    collectSectionWriteInfoFromIsec<CStringSection>(
+        catNameSym->isec, infoCategoryWriter.catNameInfo);
+  }
+
+  // Collect writer info from all the category lists (we're assuming they all
+  // would provide the same info)
+  if (!infoCategoryWriter.catPtrListInfo.valid) {
+    for (uint32_t off = catLayout.instanceMethodsOffset;
+         off <= catLayout.classPropsOffset; off += target->wordSize) {
+      if (Defined *ptrList =
+              tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, off)) {
+        collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+            ptrList->isec, infoCategoryWriter.catPtrListInfo);
+        // we've successfully collected data, so we can break
+        break;
+      }
+    }
+  }
+}
+
+// Parse a protocol list that might be linked to at a ConcatInputSection given
+// offset. The format of the protocol list is different than other lists (prop
+// lists, method lists) so we need to parse it differently
+void ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec,
+                                               uint32_t secOffset,
+                                               PointerListInfo &ptrList) {
+  if (!isec || (secOffset + target->wordSize > isec->data.size()))
+    assert("Tried to read pointer list beyond protocol section end");
+
+  const Reloc *reloc = isec->getRelocAt(secOffset);
+  if (!reloc)
+    return; // List is null, nothing to do
+
+  auto *ptrListSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  assert(ptrListSym && "Protocol list reloc does not have a valid Defined");
+
+  // Theoretically protocol count can be either 32b or 64b, but reading the
+  // first 32b is good enough
----------------
kyulee-com wrote:

I don't get it, but I saw all the places uses 32 bit operations. Is the content 64bit but we read only 32bit, or it's 32bit always? If you think it's always 32 bit, you could just delete this comment.

https://github.com/llvm/llvm-project/pull/82928


More information about the llvm-commits mailing list