[lld] [lld-macho] Implement ObjC category merging (-objc_category_merging) (PR #82928)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 29 07:43:55 PST 2024


================
@@ -320,3 +342,1007 @@ void objc::checkCategories() {
       }
   }
 }
+
+namespace {
+
+// Merges ObjC categories. Input categories are tracked per base class (see
+// categoryMap below); doMerge() is the only public operation.
+class ObjcCategoryMerger {
+  // Information about an input category
+  struct InfoInputCategory {
+    // Pointers default to nullptr so a partially filled entry is detectable.
+    ConcatInputSection *catBodyIsec = nullptr;
+    ConcatInputSection *catListIsec = nullptr;
+    uint32_t offCatListIsec = 0;
+
+    bool wasMerged = false;
+  };
+
+  // To write new (merged) categories or classes, we try to make limited
+  // assumptions about the alignment and about the sections in which the
+  // various class/category info is stored. So we just reuse the same sections
+  // and alignment as already used in existing (input) categories. To do this
+  // we have InfoCategoryWriter, which contains the various sections that the
+  // generated categories will be written to.
+  template <typename T> struct InfroWriteSection {
+    bool valid = false; // Data has been successfully collected from input
+    uint32_t align = 0;
+    // Only meaningful once 'valid' is true; nullptr until then.
+    const Section *inputSection = nullptr;
+    Reloc relocTemplate;
+    // Only meaningful once 'valid' is true; nullptr until then.
+    T *outputSection = nullptr;
+  };
+
+  struct InfoCategoryWriter {
+    InfroWriteSection<ConcatOutputSection> catListInfo;
+    InfroWriteSection<CStringSection> catNameInfo;
+    InfroWriteSection<ConcatOutputSection> catBodyInfo;
+    InfroWriteSection<ConcatOutputSection> catPtrListInfo;
+  };
+
+  // Information about a pointer list in the original categories (method lists,
+  // protocol lists, etc)
+  struct PointerListInfo {
+    // Intentionally not 'explicit': ClassExtensionInfo below initializes its
+    // lists directly from string literals.
+    PointerListInfo(const char *pszSymNamePrefix)
+        : namePrefix(pszSymNamePrefix) {}
+    const char *namePrefix;
+
+    uint32_t structSize = 0;
+    uint32_t structCount = 0;
+
+    std::vector<Symbol *> allPtrs;
+  };
+
+  // Full information about all the categories that are extending a class. This
+  // will have all the additional methods, protocols, properties that are
+  // contained in all the categories that extend a particular class.
+  struct ClassExtensionInfo {
+    // Merged names of containers. Ex: base|firstCategory|secondCategory|...
+    std::string mergedContainerName;
+    std::string baseClassName;
+    Symbol *baseClass = nullptr;
+    // In case we generate new data, mark the new data as belonging to this file
+    ObjFile *objFileForMergeData = nullptr;
+
+    PointerListInfo instanceMethods = "__OBJC_$_CATEGORY_INSTANCE_METHODS_";
+    PointerListInfo classMethods = "__OBJC_$_CATEGORY_CLASS_METHODS_";
+    PointerListInfo protocols = "__OBJC_CATEGORY_PROTOCOLS_$_";
+    PointerListInfo instanceProps = "__OBJC_$_PROP_LIST_";
+    PointerListInfo classProps = "__OBJC_$_CLASS_PROP_LIST_";
+  };
+
+public:
+  ObjcCategoryMerger(std::vector<ConcatInputSection *> &_allInputSections);
+  bool doMerge();
+
+private:
+  // Reports an error and always returns false, so that callers can simply
+  // write 'return registerError(...);'.
+  bool registerError(const char *msg);
+
+  bool collectAndValidateCategoriesData();
+  bool
+  mergeCategoriesIntoSingleCategory(std::vector<InfoInputCategory> &categories);
+  bool eraseMergedCategories();
+
+  bool generateCatListForNonErasedCategories(
+      std::map<ConcatInputSection *, std::set<uint64_t>>
+          catListToErasedOffsets);
+  template <typename T>
+  bool collectSectionWriteInfoFromIsec(InputSection *isec,
+                                       InfroWriteSection<T> &catWriteInfo);
+  bool collectCategoryWriterInfoFromCategory(InfoInputCategory &catInfo);
+  bool parseCatInfoToExtInfo(InfoInputCategory &catInfo,
+                             ClassExtensionInfo &extInfo);
+
+  bool tryParseProtocolListInfo(ConcatInputSection *isec,
+                                uint32_t symbolsPerStruct,
+                                PointerListInfo &ptrList);
+
+  bool parsePointerListInfo(ConcatInputSection *isec, uint32_t secOffset,
+                            uint32_t symbolsPerStruct,
+                            PointerListInfo &ptrList);
+
+  bool emitAndLinkPointerList(Defined *parentSym, uint32_t linkAtOffset,
+                              ClassExtensionInfo &extInfo,
+                              PointerListInfo &ptrList);
+
+  bool emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset,
+                               ClassExtensionInfo &extInfo,
+                               PointerListInfo &ptrList);
+
+  bool emitCategory(ClassExtensionInfo &extInfo, Defined *&catBodySym);
+  bool emitCatListEntrySec(std::string &forCateogryName,
+                           std::string &forBaseClassName, ObjFile *objFile,
+                           Defined *&catListSym);
+  bool emitCategoryBody(std::string &name, Defined *nameSym,
+                        Symbol *baseClassSym, std::string &baseClassName,
+                        ObjFile *objFile, Defined *&catBodySym);
+  bool emitCategoryName(std::string &name, ObjFile *objFile,
+                        Defined *&catNameSym);
+  bool createSymbolReference(Defined *refFrom, Symbol *refTo, uint32_t offset,
+                             Reloc &relocTemplate);
+  bool tryGetSymbolAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                Symbol *&sym);
+  bool tryGetDefinedAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                 Defined *&defined);
+  bool tryEraseDefinedAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                   bool stringOnly = false);
+
+  // Layout helpers; all of them are constructed from target->wordSize.
+  CategoryLayout catLayout;
+  ClassLayout classLayout;
+  ROClassLayout roClassLayout;
+  ListHeaderLayout listHeaderLayout;
+  MethodLayout methodLayout;
+  ProtocolListHeaderLayout protocolListHeaderLayout;
+
+  InfoCategoryWriter infoCategoryWriter;
+  // Held by reference: this is the vector passed to the constructor.
+  std::vector<ConcatInputSection *> &allInputSections;
+  // Map of base class Symbol to list of InfoInputCategory's for it
+  std::map<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
+
+  // Normally, the binary data comes from the input files, but since we're
+  // generating binary data ourselves, we use the below arrays to store it in.
+  // Need this to be 'static' so the data survives past the ObjcCategoryMerger
+  // object, as the data will be read by the Writer when the final binary is
+  // generated.
+  // NOTE(review): if any user keeps a pointer into an element's buffer, confirm
+  // that growing the outer vector cannot relocate that buffer (an inner
+  // SmallVector with inline storage moves its bytes when the outer one grows).
+  static SmallVector<SmallString<0>> generatedNames;
+  static SmallVector<SmallVector<uint8_t>> generatedSectionData;
+};
+
+SmallVector<SmallString<0>> ObjcCategoryMerger::generatedNames;
+SmallVector<SmallVector<uint8_t>> ObjcCategoryMerger::generatedSectionData;
+
+ObjcCategoryMerger::ObjcCategoryMerger(
+    std::vector<ConcatInputSection *> &_allInputSections)
+    : catLayout(target->wordSize), classLayout(target->wordSize),
+      roClassLayout(target->wordSize), listHeaderLayout(target->wordSize),
+      methodLayout(target->wordSize), // every layout uses target->wordSize
+      protocolListHeaderLayout(target->wordSize),
+      allInputSections(_allInputSections) {} // bound by reference, not copied
+
+bool ObjcCategoryMerger::registerError(const char *msg) {
+  std::string err = "ObjC category merging error[-merge-objc-categories]: ";
+  err += msg;
+  error(err);
+  return false; // Always return false for easy 'return registerError()' syntax.
+}
+
+// Captures, from the first input section offered for a given kind of data, the
+// section, alignment, output section and (if any) first relocation, and stores
+// them in catWriteInfo. This is a template so that it can be used both for
+// CStringSection and ConcatOutputSection.
+//
+// Returns true if catWriteInfo was already valid or has just been filled in.
+// Reports an error and returns false if isec->parent is not of type T; in that
+// case catWriteInfo is left untouched.
+template <typename T>
+bool ObjcCategoryMerger::collectSectionWriteInfoFromIsec(
+    InputSection *isec, InfroWriteSection<T> &catWriteInfo) {
+  // Data is collected only once; later sections do not overwrite it.
+  if (catWriteInfo.valid)
+    return true;
+
+  // Validate first so that a failure does not leave half-written info behind.
+  T *outputSection = dyn_cast_or_null<T>(isec->parent);
+  if (!outputSection) {
+    std::string message =
+        "Unexpected output section type for " + isec->getName().str();
+    return registerError(message.c_str());
+  }
+
+  catWriteInfo.inputSection = &isec->section;
+  catWriteInfo.align = isec->align;
+  catWriteInfo.outputSection = outputSection;
+
+  // The first relocation (when there is one) is kept as a template.
+  if (isec->relocs.size())
+    catWriteInfo.relocTemplate = isec->relocs[0];
+
+  catWriteInfo.valid = true;
+
+  return true;
+}
+
+// Looks up the relocation at `offset` in `isec` and, when that relocation
+// refers to a Symbol, stores the symbol in `sym`.
+//
+// Returns true only if a non-null Symbol was found. If there is no relocation
+// at `offset`, `sym` is left unmodified and false is returned.
+bool ObjcCategoryMerger::tryGetSymbolAtIsecOffset(ConcatInputSection *isec,
+                                                  uint32_t offset,
+                                                  Symbol *&sym) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return false;
+
+  // get<Symbol *>() asserts when the referent does not hold a Symbol. This is
+  // a "try" helper, so query the held type and report failure instead.
+  sym = reloc->referent.dyn_cast<Symbol *>();
+  return sym != nullptr;
+}
+
+// Same lookup as tryGetSymbolAtIsecOffset, narrowed to Defined symbols: on
+// success `defined` receives the symbol and true is returned. If no symbol is
+// found at `offset`, `defined` is not written and false is returned.
+bool ObjcCategoryMerger::tryGetDefinedAtIsecOffset(ConcatInputSection *isec,
+                                                   uint32_t offset,
+                                                   Defined *&defined) {
+  Symbol *referenced = nullptr;
+  if (tryGetSymbolAtIsecOffset(isec, offset, referenced)) {
+    defined = dyn_cast_or_null<Defined>(referenced);
+    return defined != nullptr;
+  }
+
+  return false;
+}
+
+// Given a ConcatInputSection and an offset, if the relocation at that offset
+// refers to a Defined symbol, erase that symbol's data from the final output
+// by marking it with LinkerOptReason::CategoryMerging. Used for easily erasing
+// already merged strings, method lists, etc.
+//
+// If `stringOnly` is true, data in a ConcatInputSection is never marked; only
+// a string piece in a CStringInputSection can be.
+// Returns true if something was marked, false otherwise.
+bool ObjcCategoryMerger::tryEraseDefinedAtIsecOffset(ConcatInputSection *isec,
+                                                     uint32_t offset,
+                                                     bool stringOnly) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return false;
+
+  // dyn_cast rather than get: the latter asserts when the referent does not
+  // hold a Symbol, whereas this helper is expected to simply return false.
+  auto *sym = dyn_cast_or_null<Defined>(reloc->referent.dyn_cast<Symbol *>());
+
+  if (!sym)
+    return false;
+
+  if (!stringOnly) {
+    if (auto *concatIsec = dyn_cast_or_null<ConcatInputSection>(sym->isec)) {
+      concatIsec->linkerOptimizeReason = LinkerOptReason::CategoryMerging;
+      return true;
+    }
+  }
+
+  if (auto *cstrIsec = dyn_cast_or_null<CStringInputSection>(sym->isec)) {
+    // The string piece is located by the symbol's value plus the addend.
+    uint32_t totalOffset = sym->value + reloc->addend;
+    StringPiece &piece = cstrIsec->getStringPiece(totalOffset);
+    piece.linkerOptimizeReason = LinkerOptReason::CategoryMerging;
+    return true;
+  }
+
+  return false;
+}
+
+bool ObjcCategoryMerger::collectCategoryWriterInfoFromCategory(
+    InfoInputCategory &catInfo) {
+
+  if (!collectSectionWriteInfoFromIsec<ConcatOutputSection>(
----------------
alx32 wrote:

`error` is not fatal: it just prints a message, and lld then returns an error code when it exits.
For example you can have:
```
  error("error1");
  error("error2");
  ```
  and get:
```
ld64.lld: error: error1
ld64.lld: error: error2
```
The code is structured so that we print only one error and then return all the way out of the category checker. This is because any subsequent error messages would likely be invalid or confusing: only the first error matters, since after it the whole category checker is in a bad state.

So we do need something like
```
if ( false )
  return error("")
```
or
```
if ( false )  {
  error("")
  return false
}
```

I think options are:
1. don't stop at the first error - report each error and keep going (will probably output a few errors and crash later on)
2. hard system.exit() after first error - I don't think this is recommended in lld
3. leave as-is with first format above
4. refactor to 2nd format above

Or... any other suggestions?

https://github.com/llvm/llvm-project/pull/82928


More information about the llvm-commits mailing list