[lld] [llvm] [lld-macho] Implement ObjC category merging (-objc_category_merging) (PR #82928)

Kyungwoo Lee via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 5 10:10:21 PST 2024


================
@@ -320,3 +342,1007 @@ void objc::checkCategories() {
       }
   }
 }
+
+namespace {
+
+// Helper that parses the ObjC categories found in the input sections and
+// emits merged categories (see mergeCategoriesIntoSingleCategory).
+class ObjcCategoryMerger {
+  // Information about an input category
+  struct InfoInputCategory {
+    ConcatInputSection *catBodyIsec = nullptr;
+    ConcatInputSection *catListIsec = nullptr;
+    uint32_t offCatListIsec = 0;
+
+    bool wasMerged = false;
+  };
+
+  // To write new (merged) categories or classes, we try to make only limited
+  // assumptions about the alignment and the sections the various
+  // class/category info are stored in. So we just reuse the same sections and
+  // alignment as already used in existing (input) categories. To do this we
+  // have InfoCategoryWriter which contains the various sections that the
+  // generated categories will be written to.
+  template <typename T> struct InfroWriteSection {
+    bool valid = false; // Data has been successfully collected from input
+    uint32_t align = 0;
+    const Section *inputSection = nullptr;
+    Reloc relocTemplate;
+    T *outputSection = nullptr;
+  };
+
+  struct InfoCategoryWriter {
+    InfroWriteSection<ConcatOutputSection> catListInfo;
+    InfroWriteSection<CStringSection> catNameInfo;
+    InfroWriteSection<ConcatOutputSection> catBodyInfo;
+    InfroWriteSection<ConcatOutputSection> catPtrListInfo;
+  };
+
+  // Information about a pointer list in the original categories (method lists,
+  // protocol lists, etc)
+  struct PointerListInfo {
+    PointerListInfo(const char *pszSymNamePrefix)
+        : namePrefix(pszSymNamePrefix) {}
+    // Prefix used when naming the symbol of the generated (merged) list
+    const char *namePrefix;
+
+    uint32_t structSize = 0;
+    uint32_t structCount = 0;
+
+    std::vector<Symbol *> allPtrs;
+  };
+
+  // Full information about all the categories that are extending a class. This
+  // will have all the additional methods, protocols, properties that are
+  // contained in all the categories that extend a particular class.
+  struct ClassExtensionInfo {
+    // Merged names of containers. Ex: base|firstCategory|secondCategory|...
+    std::string mergedContainerName;
+    std::string baseClassName;
+    Symbol *baseClass = nullptr;
+    // In case we generate new data, mark the new data as belonging to this file
+    ObjFile *objFileForMergeData = nullptr;
+
+    PointerListInfo instanceMethods = "__OBJC_$_CATEGORY_INSTANCE_METHODS_";
+    PointerListInfo classMethods = "__OBJC_$_CATEGORY_CLASS_METHODS_";
+    PointerListInfo protocols = "__OBJC_CATEGORY_PROTOCOLS_$_";
+    PointerListInfo instanceProps = "__OBJC_$_PROP_LIST_";
+    PointerListInfo classProps = "__OBJC_$_CLASS_PROP_LIST_";
+  };
+
+public:
+  ObjcCategoryMerger(std::vector<ConcatInputSection *> &_allInputSections);
+  bool doMerge();
+
+private:
+  // This returns bool and always false for easy 'return false;' statements
+  bool registerError(const char *msg);
+
+  bool collectAndValidateCategoriesData();
+  bool
+  mergeCategoriesIntoSingleCategory(std::vector<InfoInputCategory> &categories);
+  bool eraseMergedCategories();
+
+  bool generateCatListForNonErasedCategories(
+      std::map<ConcatInputSection *, std::set<uint64_t>>
+          catListToErasedOffsets);
+  template <typename T>
+  bool collectSectionWriteInfoFromIsec(InputSection *isec,
+                                       InfroWriteSection<T> &catWriteInfo);
+  bool collectCategoryWriterInfoFromCategory(InfoInputCategory &catInfo);
+  bool parseCatInfoToExtInfo(InfoInputCategory &catInfo,
+                             ClassExtensionInfo &extInfo);
+
+  // 'secOffset' is the offset inside 'isec' at which the (optional) reloc to
+  // the protocol list is looked up.
+  bool tryParseProtocolListInfo(ConcatInputSection *isec, uint32_t secOffset,
+                                PointerListInfo &ptrList);
+
+  bool parsePointerListInfo(ConcatInputSection *isec, uint32_t secOffset,
+                            uint32_t symbolsPerStruct,
+                            PointerListInfo &ptrList);
+
+  bool emitAndLinkPointerList(Defined *parentSym, uint32_t linkAtOffset,
+                              ClassExtensionInfo &extInfo,
+                              PointerListInfo &ptrList);
+
+  bool emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset,
+                               ClassExtensionInfo &extInfo,
+                               PointerListInfo &ptrList);
+
+  bool emitCategory(ClassExtensionInfo &extInfo, Defined *&catBodySym);
+  bool emitCatListEntrySec(std::string &forCateogryName,
+                           std::string &forBaseClassName, ObjFile *objFile,
+                           Defined *&catListSym);
+  bool emitCategoryBody(std::string &name, Defined *nameSym,
+                        Symbol *baseClassSym, std::string &baseClassName,
+                        ObjFile *objFile, Defined *&catBodySym);
+  bool emitCategoryName(std::string &name, ObjFile *objFile,
+                        Defined *&catNameSym);
+  bool createSymbolReference(Defined *refFrom, Symbol *refTo, uint32_t offset,
+                             Reloc &relocTemplate);
+  bool tryGetSymbolAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                Symbol *&sym);
+  bool tryGetDefinedAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                 Defined *&defined);
+  bool tryEraseDefinedAtIsecOffset(ConcatInputSection *isec, uint32_t offset,
+                                   bool stringOnly = false);
+
+  CategoryLayout catLayout;
+  ClassLayout classLayout;
+  ROClassLayout roClassLayout;
+  ListHeaderLayout listHeaderLayout;
+  MethodLayout methodLayout;
+  ProtocolListHeaderLayout protocolListHeaderLayout;
+
+  InfoCategoryWriter infoCategoryWriter;
+  std::vector<ConcatInputSection *> &allInputSections;
+  // Map of base class Symbol to list of InfoInputCategory's for it
+  std::map<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
+
+  // Normally, the binary data comes from the input files, but since we're
+  // generating binary data ourselves, we use the below arrays to store it in.
+  // Need this to be 'static' so the data survives past the ObjcCategoryMerger
+  // object, as the data will be read by the Writer when the final binary is
+  // generated.
+  //
+  // Sections and symbols keep raw pointers into these buffers, so every
+  // element must keep its bytes at a stable address while the outer vector
+  // grows. Both element types therefore use zero inline capacity: their bytes
+  // always live on the heap and a move only transfers the heap pointer.
+  static SmallVector<SmallString<0>> generatedNames;
+  static SmallVector<SmallVector<uint8_t, 0>> generatedSectionData;
+};
+
+SmallVector<SmallString<0>> ObjcCategoryMerger::generatedNames;
+SmallVector<SmallVector<uint8_t, 0>> ObjcCategoryMerger::generatedSectionData;
+
+// Builds every layout helper from the target's word size and keeps a
+// reference (not a copy) to the caller's list of input sections.
+ObjcCategoryMerger::ObjcCategoryMerger(
+    std::vector<ConcatInputSection *> &_allInputSections)
+    : catLayout(target->wordSize), classLayout(target->wordSize),
+      roClassLayout(target->wordSize), listHeaderLayout(target->wordSize),
+      methodLayout(target->wordSize),
+      protocolListHeaderLayout(target->wordSize),
+      allInputSections(_allInputSections) {}
+
+bool ObjcCategoryMerger::registerError(const char *msg) {
+  std::string err = "ObjC category merging error[-merge-objc-categories]: ";
+  err += msg;
+  error(err);
+  return false; // Always return false for easy 'return registerError()' syntax.
+}
+
+// Records, the first time it is called for a given 'catWriteInfo', the
+// section, alignment, output section and a template relocation taken from
+// 'isec'. Later calls are no-ops once 'catWriteInfo.valid' is set.
+//
+// This is a template so that it can be used both for CStringSection and
+// ConcatOutputSection.
+//
+// Returns false (after registering an error) if 'isec' is null or if its
+// parent is not an output section of type T; 'catWriteInfo' is left untouched
+// in that case.
+template <typename T>
+bool ObjcCategoryMerger::collectSectionWriteInfoFromIsec(
+    InputSection *isec, InfroWriteSection<T> &catWriteInfo) {
+  if (catWriteInfo.valid)
+    return true;
+
+  if (!isec)
+    return registerError("Cannot collect write info from a null InputSection");
+
+  // Validate before touching 'catWriteInfo' so a failure leaves no
+  // half-filled state behind.
+  T *outputSection = dyn_cast_or_null<T>(isec->parent);
+  if (!outputSection) {
+    std::string message =
+        "Unexpected output section type for " + isec->getName().str();
+    return registerError(message.c_str());
+  }
+
+  catWriteInfo.inputSection = &isec->section;
+  catWriteInfo.align = isec->align;
+  catWriteInfo.outputSection = outputSection;
+
+  // The first reloc (if any) serves as the template for generated relocs.
+  if (!isec->relocs.empty())
+    catWriteInfo.relocTemplate = isec->relocs[0];
+
+  catWriteInfo.valid = true;
+
+  return true;
+}
+
+// Looks up the reloc at 'offset' inside 'isec' and, if it refers to a symbol,
+// stores that symbol in 'sym'. Returns false when there is no reloc at that
+// offset or when the reloc does not refer to a symbol.
+bool ObjcCategoryMerger::tryGetSymbolAtIsecOffset(ConcatInputSection *isec,
+                                                  uint32_t offset,
+                                                  Symbol *&sym) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return false;
+
+  // dyn_cast yields null for a section referent; get<> would assert instead,
+  // which is not what a 'try' function should do.
+  sym = reloc->referent.dyn_cast<Symbol *>();
+  return sym != nullptr;
+}
+
+// Same as tryGetSymbolAtIsecOffset, but only succeeds when the symbol found
+// at 'offset' is a Defined; 'defined' is only written once a symbol was found.
+bool ObjcCategoryMerger::tryGetDefinedAtIsecOffset(ConcatInputSection *isec,
+                                                   uint32_t offset,
+                                                   Defined *&defined) {
+  Symbol *symAtOffset = nullptr;
+  if (tryGetSymbolAtIsecOffset(isec, offset, symAtOffset)) {
+    defined = dyn_cast_or_null<Defined>(symAtOffset);
+    return defined != nullptr;
+  }
+
+  return false;
+}
+
+// Given a ConcatInputSection and an offset, if there is a symbol (Defined) at
+// that offset, then mark what it points to as optimized away by category
+// merging. Used for easily erasing already merged strings, method lists, etc.
+//
+// When 'stringOnly' is true, only symbols living in a CStringInputSection are
+// handled. Returns true if something was marked, false otherwise (no reloc,
+// reloc not referring to a Defined, or unsupported section kind).
+bool ObjcCategoryMerger::tryEraseDefinedAtIsecOffset(ConcatInputSection *isec,
+                                                     uint32_t offset,
+                                                     bool stringOnly) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return false;
+
+  // dyn_cast (rather than get<>) so a section referent yields 'false'
+  // instead of tripping an assertion.
+  Defined *sym =
+      dyn_cast_or_null<Defined>(reloc->referent.dyn_cast<Symbol *>());
+
+  if (!sym)
+    return false;
+
+  if (!stringOnly) {
+    if (auto *concatIsec = dyn_cast_or_null<ConcatInputSection>(sym->isec)) {
+      concatIsec->linkerOptimizeReason = LinkerOptReason::CategoryMerging;
+      return true;
+    }
+  }
+
+  if (auto *stringIsec = dyn_cast_or_null<CStringInputSection>(sym->isec)) {
+    // Only the string piece that the reloc points into is marked.
+    uint32_t totalOffset = sym->value + reloc->addend;
+    StringPiece &piece = stringIsec->getStringPiece(totalOffset);
+    piece.linkerOptimizeReason = LinkerOptReason::CategoryMerging;
+    return true;
+  }
+
+  return false;
+}
+
+// Fills in the parts of 'infoCategoryWriter' that are still missing, using
+// the sections referenced by the input category 'catInfo': the catlist
+// section, the category body section, the section holding the category name
+// and the section holding the first pointer list found in the body.
+bool ObjcCategoryMerger::collectCategoryWriterInfoFromCategory(
+    InfoInputCategory &catInfo) {
+  InfoCategoryWriter &writer = infoCategoryWriter;
+  ConcatInputSection *bodyIsec = catInfo.catBodyIsec;
+
+  if (!collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+          catInfo.catListIsec, writer.catListInfo) ||
+      !collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+          bodyIsec, writer.catBodyInfo))
+    return false;
+
+  if (!writer.catNameInfo.valid) {
+    const Reloc *nameReloc = bodyIsec->getRelocAt(catLayout.nameOffset);
+    if (!nameReloc)
+      return registerError("Category does not have a reloc at nameOffset");
+
+    auto *nameSym =
+        dyn_cast_or_null<Defined>(nameReloc->referent.dyn_cast<Symbol *>());
+    if (!nameSym)
+      return registerError(
+          "Reloc of category name is not a valid Defined symbol");
+
+    if (!collectSectionWriteInfoFromIsec<CStringSection>(nameSym->isec,
+                                                         writer.catNameInfo))
+      return false;
+  }
+
+  if (writer.catPtrListInfo.valid)
+    return true;
+
+  // Walk the pointer-list slots of the category body and take the writer info
+  // from the first list that is present (we're assuming all of them would
+  // provide the same info).
+  for (uint32_t slot = catLayout.instanceMethodsOffset;
+       slot <= catLayout.classPropsOffset; slot += target->wordSize) {
+    Defined *listSym;
+    if (!tryGetDefinedAtIsecOffset(bodyIsec, slot, listSym))
+      continue;
+
+    return collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+        listSym->isec, writer.catPtrListInfo);
+  }
+
+  return true;
+}
+
+// Parse a protocol list that might be linked to at a ConcatInputSection given
+// offset. The format of the protocol list is different than other lists (prop
+// lists, method lists) so we need to parse it differently.
+//
+// On success the protocols found are appended to 'ptrList'. A missing reloc
+// at 'secOffset' means "no protocol list" and is not an error. Returns false
+// (after registering an error) if the data is malformed.
+bool ObjcCategoryMerger::tryParseProtocolListInfo(ConcatInputSection *isec,
+                                                  uint32_t secOffset,
+                                                  PointerListInfo &ptrList) {
+  if (!isec || (secOffset + target->wordSize > isec->data.size()))
+    return registerError(
+        "Tried to read pointer list beyond protocol section end");
+
+  const Reloc *listReloc = isec->getRelocAt(secOffset);
+  if (!listReloc)
+    return true; // List is null, return true because this is not an error
+
+  auto *ptrListSym =
+      dyn_cast_or_null<Defined>(listReloc->referent.dyn_cast<Symbol *>());
+  if (!ptrListSym || !ptrListSym->isec)
+    return registerError("Protocol list reloc does not have a valid Defined");
+
+  auto *listIsec = ptrListSym->isec;
+
+  // Make sure the header is actually there before reading the count from it.
+  if (listIsec->data.size() < protocolListHeaderLayout.totalSize)
+    return registerError("Protocol list does not match expected size");
+
+  // Theoretically protocol count can be either 32b or 64b, but reading the
+  // first 32b is good enough
+  uint32_t protocolCount = *reinterpret_cast<const uint32_t *>(
+      listIsec->data.data() + protocolListHeaderLayout.protocolCountOffset);
+
+  // Computed in 64 bits so a bogus count cannot wrap around and accidentally
+  // match the section size.
+  uint64_t expectedListSize =
+      (uint64_t(protocolCount) * target->wordSize) +
+      /*header(count)*/ protocolListHeaderLayout.totalSize +
+      /*extra null value*/ target->wordSize;
+  if (expectedListSize != listIsec->data.size())
+    return registerError("Protocol list does not match expected size");
+
+  ptrList.structCount += protocolCount;
+  ptrList.structSize = target->wordSize;
+
+  uint32_t off = protocolListHeaderLayout.totalSize;
+  for (uint32_t inx = 0; inx < protocolCount; inx++) {
+    const Reloc *entryReloc = listIsec->getRelocAt(off);
+    if (!entryReloc)
+      return registerError("No reloc found at protocol list offset");
+
+    auto *listSym =
+        dyn_cast_or_null<Defined>(entryReloc->referent.dyn_cast<Symbol *>());
+    if (!listSym)
+      return registerError("Protocol list reloc does not have a valid Defined");
+
+    ptrList.allPtrs.push_back(listSym);
+    off += target->wordSize;
+  }
+
+  return true;
+}
+
+// Parse a pointer list that might be linked to at a ConcatInputSection given
+// offset. This can be used for instance methods, class methods, instance props
+// and class props since they have the same format.
+//
+// On success every symbol referenced by the list is appended to 'ptrList'. A
+// missing reloc at 'secOffset' means "no list" and is not an error. Returns
+// false (after registering an error) if the data is malformed.
+bool ObjcCategoryMerger::parsePointerListInfo(ConcatInputSection *isec,
+                                              uint32_t secOffset,
+                                              uint32_t symbolsPerStruct,
+                                              PointerListInfo &ptrList) {
+  assert(symbolsPerStruct == 2 || symbolsPerStruct == 3);
+  if (!isec || (secOffset + target->wordSize > isec->data.size()))
+    return registerError("Tried to read pointer list beyond section end");
+
+  const Reloc *listReloc = isec->getRelocAt(secOffset);
+  if (!listReloc)
+    return true; // No reloc found, nothing to parse, so return success
+
+  auto *ptrListSym =
+      dyn_cast_or_null<Defined>(listReloc->referent.dyn_cast<Symbol *>());
+  if (!ptrListSym || !ptrListSym->isec)
+    return registerError("Reloc does not have a valid Defined");
+
+  auto *listIsec = ptrListSym->isec;
+
+  // Make sure the header is actually there before reading from it.
+  if (listIsec->data.size() < listHeaderLayout.totalSize)
+    return registerError("Pointer list does not match expected size");
+
+  uint32_t thisStructSize = *reinterpret_cast<const uint32_t *>(
+      listIsec->data.data() + listHeaderLayout.structSizeOffset);
+  uint32_t thisStructCount = *reinterpret_cast<const uint32_t *>(
+      listIsec->data.data() + listHeaderLayout.structCountOffset);
+
+  // The struct size comes from input data, so a mismatch with previously
+  // parsed lists has to be a real error and not only a debug-build assert.
+  if (ptrList.structSize && thisStructSize != ptrList.structSize)
+    return registerError(
+        "Pointer list struct size differs from previously parsed lists");
+
+  // Computed in 64 bits so bogus header values cannot wrap around and
+  // accidentally match the section size.
+  uint64_t expectedListSize = listHeaderLayout.totalSize +
+                              (uint64_t(thisStructSize) * thisStructCount);
+
+  if (expectedListSize != listIsec->data.size())
+    return registerError("Pointer list does not match expected size");
+
+  ptrList.structCount += thisStructCount;
+  ptrList.structSize = thisStructSize;
+
+  // Every word after the header is expected to carry a reloc to a Defined.
+  for (uint64_t off = listHeaderLayout.totalSize; off < expectedListSize;
+       off += target->wordSize) {
+    const Reloc *entryReloc = listIsec->getRelocAt(off);
+    if (!entryReloc)
+      return registerError("No reloc found at pointer list offset");
+
+    auto *listSym =
+        dyn_cast_or_null<Defined>(entryReloc->referent.dyn_cast<Symbol *>());
+    if (!listSym)
+      return registerError("Reloc does not have a valid Defined");
+
+    ptrList.allPtrs.push_back(listSym);
+  }
+
+  return true;
+}
+
+// Here we parse all the information of an input category (catInfo) and
+// append-store the parsed info into the structure which will contain all the
+// information about how a class is extended (extInfo).
+//
+// Returns false (after registering an error) if the category is malformed or
+// extends a different base class than the one already recorded in extInfo.
+bool ObjcCategoryMerger::parseCatInfoToExtInfo(InfoInputCategory &catInfo,
+                                               ClassExtensionInfo &extInfo) {
+  const Reloc *catNameReloc =
+      catInfo.catBodyIsec->getRelocAt(catLayout.nameOffset);
+
+  //// Parse name ///////////////////////////////////////////////////////////
+  if (!catNameReloc)
+    return registerError("Category does not have a reloc at 'nameOffset'");
+
+  // The first category parsed decides which file the generated data is
+  // attributed to. The emit* functions dereference this pointer, so fail here
+  // rather than crash later.
+  if (!extInfo.objFileForMergeData) {
+    extInfo.objFileForMergeData =
+        dyn_cast_or_null<ObjFile>(catInfo.catBodyIsec->getFile());
+    if (!extInfo.objFileForMergeData)
+      return registerError("Category body does not belong to an ObjFile");
+  }
+
+  if (!extInfo.mergedContainerName.empty())
+    extInfo.mergedContainerName += "|";
+
+  StringRef catName = getReferentString(*catNameReloc);
+  extInfo.mergedContainerName += catName.str();
+
+  //// Parse base class /////////////////////////////////////////////////////
+  const Reloc *klassReloc =
+      catInfo.catBodyIsec->getRelocAt(catLayout.klassOffset);
+
+  if (!klassReloc)
+    return registerError("Category does not have a reloc at 'klassOffset'");
+
+  // dyn_cast (rather than get<>) so a section referent becomes an error
+  // instead of an assertion failure.
+  Symbol *classSym = klassReloc->referent.dyn_cast<Symbol *>();
+  if (!classSym)
+    return registerError("Reloc at 'klassOffset' does not refer to a symbol");
+
+  if (extInfo.baseClass && extInfo.baseClass != classSym)
+    return registerError("Trying to parse category info into container with "
+                         "different base class");
+
+  extInfo.baseClass = classSym;
+
+  if (extInfo.baseClassName.empty()) {
+    llvm::StringRef classPrefix("_OBJC_CLASS_$_");
+    if (!classSym->getName().starts_with(classPrefix))
+      return registerError(
+          "Base class symbol does not start with '_OBJC_CLASS_$_'");
+
+    extInfo.baseClassName = classSym->getName().substr(classPrefix.size());
+  }
+
+  //// Parse the pointer lists //////////////////////////////////////////////
+  if (!parsePointerListInfo(catInfo.catBodyIsec,
+                            catLayout.instanceMethodsOffset,
+                            /*symbolsPerStruct=*/3, extInfo.instanceMethods))
+    return false;
+
+  if (!parsePointerListInfo(catInfo.catBodyIsec, catLayout.classMethodsOffset,
+                            /*symbolsPerStruct=*/3, extInfo.classMethods))
+    return false;
+
+  if (!tryParseProtocolListInfo(catInfo.catBodyIsec, catLayout.protocolsOffset,
+                                extInfo.protocols))
+    return false;
+
+  if (!parsePointerListInfo(catInfo.catBodyIsec, catLayout.instancePropsOffset,
+                            /*symbolsPerStruct=*/2, extInfo.instanceProps))
+    return false;
+
+  if (!parsePointerListInfo(catInfo.catBodyIsec, catLayout.classPropsOffset,
+                            /*symbolsPerStruct=*/2, extInfo.classProps))
+    return false;
+
+  return true;
+}
+
+// Generate a protocol list (including header) and link it into the parent at
+// the specified offset.
+//
+// Nothing is emitted (and true is returned) when 'ptrList' is empty. The new
+// section is appended to 'allInputSections' and its symbol to the symbols of
+// the parent's object file.
+bool ObjcCategoryMerger::emitAndLinkProtocolList(Defined *parentSym,
+                                                 uint32_t linkAtOffset,
+                                                 ClassExtensionInfo &extInfo,
+                                                 PointerListInfo &ptrList) {
+  if (ptrList.allPtrs.empty())
+    return true;
+
+  assert(ptrList.allPtrs.size() == ptrList.structCount);
+
+  uint32_t bodySize = (ptrList.structCount * target->wordSize) +
+                      /*header(count)*/ protocolListHeaderLayout.totalSize +
+                      /*extra null value*/ target->wordSize;
+  generatedSectionData.push_back(SmallVector<uint8_t>(bodySize, 0));
+  auto &bodyStorage = generatedSectionData.back();
+
+  // This theoretically can be either 32b or 64b, but writing just the first 32b
+  // is good enough. Written through the mutable storage; the section below
+  // only gets a read-only view of the same bytes.
+  *reinterpret_cast<uint32_t *>(bodyStorage.data() +
+                                protocolListHeaderLayout.protocolCountOffset) =
+      ptrList.allPtrs.size();
+
+  llvm::ArrayRef<uint8_t> bodyData = bodyStorage;
+
+  ConcatInputSection *listSec = make<ConcatInputSection>(
+      *infoCategoryWriter.catPtrListInfo.inputSection, bodyData,
+      infoCategoryWriter.catPtrListInfo.align);
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+  listSec->live = true;
+  allInputSections.push_back(listSec);
+
+  generatedNames.push_back(StringRef(ptrList.namePrefix));
+  auto &symName = generatedNames.back();
+  symName += extInfo.baseClassName + "_$_(" + extInfo.mergedContainerName + ")";
+
+  Defined *ptrListSym = make<Defined>(
+      symName.c_str(), /*file=*/parentSym->getObjectFile(), listSec,
+      /*value=*/0, bodyData.size(),
+      /*isWeakDef=*/false, /*isExternal=*/false, /*isPrivateExtern=*/false,
+      /*includeInSymtab=*/true, /*isReferencedDynamically=*/false,
+      /*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false);
+
+  ptrListSym->used = true;
+  parentSym->getObjectFile()->symbols.push_back(ptrListSym);
+
+  // Make the parent point at the new list ...
+  if (!createSymbolReference(parentSym, ptrListSym, linkAtOffset,
+                             infoCategoryWriter.catBodyInfo.relocTemplate))
+    return false;
+
+  // ... and make each slot after the header point at one protocol.
+  uint32_t offset = protocolListHeaderLayout.totalSize;
+  for (Symbol *symbol : ptrList.allPtrs) {
+    if (!createSymbolReference(ptrListSym, symbol, offset,
+                               infoCategoryWriter.catPtrListInfo.relocTemplate))
+      return false;
+
+    offset += target->wordSize;
+  }
+
+  return true;
+}
+
+// Generate a pointer list (including header) and link it into the parent at the
+// specified offset. This is used for instance and class methods and
+// properties.
+//
+// Nothing is emitted (and true is returned) when 'ptrList' is empty. The new
+// section is appended to 'allInputSections' and its symbol to the symbols of
+// the parent's object file.
+bool ObjcCategoryMerger::emitAndLinkPointerList(Defined *parentSym,
+                                                uint32_t linkAtOffset,
+                                                ClassExtensionInfo &extInfo,
+                                                PointerListInfo &ptrList) {
+  if (ptrList.allPtrs.empty())
+    return true;
+
+  assert(ptrList.allPtrs.size() * target->wordSize ==
+         ptrList.structCount * ptrList.structSize);
+
+  // Generate body
+  uint32_t bodySize =
+      listHeaderLayout.totalSize + (ptrList.structSize * ptrList.structCount);
+  generatedSectionData.push_back(SmallVector<uint8_t>(bodySize, 0));
+  auto &bodyStorage = generatedSectionData.back();
+
+  // Fill in the header through the mutable storage; the section below only
+  // gets a read-only view of the same bytes.
+  *reinterpret_cast<uint32_t *>(bodyStorage.data() +
+                                listHeaderLayout.structSizeOffset) =
+      ptrList.structSize;
+  *reinterpret_cast<uint32_t *>(bodyStorage.data() +
+                                listHeaderLayout.structCountOffset) =
+      ptrList.structCount;
+
+  llvm::ArrayRef<uint8_t> bodyData = bodyStorage;
+
+  ConcatInputSection *listSec = make<ConcatInputSection>(
+      *infoCategoryWriter.catPtrListInfo.inputSection, bodyData,
+      infoCategoryWriter.catPtrListInfo.align);
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+  listSec->live = true;
+  allInputSections.push_back(listSec);
+
+  generatedNames.push_back(StringRef(ptrList.namePrefix));
+  auto &symName = generatedNames.back();
+  symName += extInfo.baseClassName + "_$_" + extInfo.mergedContainerName;
+
+  Defined *ptrListSym = make<Defined>(
+      symName.c_str(), /*file=*/parentSym->getObjectFile(), listSec,
+      /*value=*/0, bodyData.size(),
+      /*isWeakDef=*/false, /*isExternal=*/false, /*isPrivateExtern=*/false,
+      /*includeInSymtab=*/true, /*isReferencedDynamically=*/false,
+      /*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false);
+
+  ptrListSym->used = true;
+  parentSym->getObjectFile()->symbols.push_back(ptrListSym);
+
+  // Make the parent point at the new list ...
+  if (!createSymbolReference(parentSym, ptrListSym, linkAtOffset,
+                             infoCategoryWriter.catBodyInfo.relocTemplate))
+    return false;
+
+  // ... and make each word after the header point at one collected symbol.
+  uint32_t offset = listHeaderLayout.totalSize;
+  for (Symbol *symbol : ptrList.allPtrs) {
+    if (!createSymbolReference(ptrListSym, symbol, offset,
+                               infoCategoryWriter.catPtrListInfo.relocTemplate))
+      return false;
+
+    offset += target->wordSize;
+  }
+
+  return true;
+}
+
+// Creates a one-slot (word sized, zero filled) __objc_catlist
+// ConcatInputSection together with a Defined that covers it; the symbol is
+// returned through 'catListSym' and registered with 'objFile'.
+bool ObjcCategoryMerger::emitCatListEntrySec(std::string &forCateogryName,
+                                             std::string &forBaseClassName,
+                                             ObjFile *objFile,
+                                             Defined *&catListSym) {
+  const auto &writeInfo = infoCategoryWriter.catListInfo;
+
+  // Backing storage for the single slot.
+  generatedSectionData.push_back(SmallVector<uint8_t>(target->wordSize, 0));
+  llvm::ArrayRef<uint8_t> slotData = generatedSectionData.back();
+
+  ConcatInputSection *slotSec = make<ConcatInputSection>(
+      *writeInfo.inputSection, slotData, writeInfo.align);
+  slotSec->parent = writeInfo.outputSection;
+  slotSec->live = true;
+  allInputSections.push_back(slotSec);
+
+  // Build the symbol name directly inside the long-lived name storage.
+  SmallString<0> &slotName = generatedNames.emplace_back();
+  slotName += "<__objc_catlist slot for merged category ";
+  slotName += forBaseClassName + "(" + forCateogryName + ")>";
+
+  catListSym = make<Defined>(
+      StringRef(slotName), /*file=*/objFile, slotSec,
+      /*value=*/0, slotData.size(), /*isWeakDef=*/false, /*isExternal=*/false,
+      /*isPrivateExtern=*/false, /*includeInSymtab=*/false,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+  catListSym->used = true;
+
+  objFile->symbols.push_back(catListSym);
+  return true;
+}
+
+// Here we generate the main category body and just the body and link the name
+// and base class into it. We don't link any other info like the protocol and
+// class/instance methods/props.
+//
+// The new section is appended to 'allInputSections'; its symbol is returned
+// through 'catBodySym' and registered with 'objFile'.
+bool ObjcCategoryMerger::emitCategoryBody(std::string &name, Defined *nameSym,
+                                          Symbol *baseClassSym,
+                                          std::string &baseClassName,
+                                          ObjFile *objFile,
+                                          Defined *&catBodySym) {
+  generatedSectionData.push_back(SmallVector<uint8_t>(catLayout.totalSize, 0));
+  auto &bodyStorage = generatedSectionData.back();
+
+  // Record the struct size in the body. Written through the mutable storage;
+  // the section below only gets a read-only view of the same bytes.
+  *reinterpret_cast<uint32_t *>(bodyStorage.data() + catLayout.sizeOffset) =
+      catLayout.totalSize;
+
+  llvm::ArrayRef<uint8_t> bodyData = bodyStorage;
+
+  ConcatInputSection *newBodySec =
+      make<ConcatInputSection>(*infoCategoryWriter.catBodyInfo.inputSection,
+                               bodyData, infoCategoryWriter.catBodyInfo.align);
+  newBodySec->parent = infoCategoryWriter.catBodyInfo.outputSection;
+  newBodySec->live = true;
+  allInputSections.push_back(newBodySec);
+
+  std::string symName =
+      "__OBJC_$_CATEGORY_" + baseClassName + "_$_(" + name + ")";
+  generatedNames.push_back(StringRef(symName));
+  catBodySym = make<Defined>(
+      StringRef(generatedNames.back()), /*file=*/objFile, newBodySec,
+      /*value=*/0, bodyData.size(), /*isWeakDef=*/false, /*isExternal=*/false,
+      /*isPrivateExtern=*/false, /*includeInSymtab=*/true,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  catBodySym->used = true;
+  objFile->symbols.push_back(catBodySym);
+
+  // Create a reloc to the category name
+  if (!createSymbolReference(catBodySym, nameSym, catLayout.nameOffset,
+                             infoCategoryWriter.catBodyInfo.relocTemplate))
+    return false;
+
+  // Create a reloc to the base class (either external or internal)
+  if (!createSymbolReference(catBodySym, baseClassSym, catLayout.klassOffset,
+                             infoCategoryWriter.catBodyInfo.relocTemplate))
+    return false;
+
+  return true;
+}
+
+// This writes the new category name (for the merged category) into the binary
+// and returns the symbol for it through 'catNameSym'. The symbol is also
+// registered with 'objFile'.
+bool ObjcCategoryMerger::emitCategoryName(std::string &name, ObjFile *objFile,
+                                          Defined *&catNameSym) {
+  // Copy the name, including its terminating NUL, into long-lived storage.
+  llvm::ArrayRef<uint8_t> inputNameArrData(
+      reinterpret_cast<const uint8_t *>(name.c_str()), name.size() + 1);
+  generatedSectionData.push_back(SmallVector<uint8_t>(inputNameArrData));
+
+  llvm::ArrayRef<uint8_t> nameData = generatedSectionData.back();
+
+  CStringInputSection *newStringSec = make<CStringInputSection>(
+      *infoCategoryWriter.catNameInfo.inputSection, nameData,
+      infoCategoryWriter.catNameInfo.align, true);
+
+  // The section holds exactly one string, so it has exactly one piece.
+  newStringSec->splitIntoPieces();
+  newStringSec->pieces[0].live = true;
+  newStringSec->parent = infoCategoryWriter.catNameInfo.outputSection;
+
+  catNameSym = make<Defined>(
+      "<merged category name>", /*file=*/objFile, newStringSec,
+      /*value=*/0, nameData.size(),
+      /*isWeakDef=*/false, /*isExternal=*/false, /*isPrivateExtern=*/false,
+      /*includeInSymtab=*/false, /*isReferencedDynamically=*/false,
+      /*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false);
+
+  catNameSym->used = true;
+  objFile->symbols.push_back(catNameSym);
+  return true;
+}
+
+// Emits a complete new category described by 'extInfo': first the name, then
+// the body, then an __objc_catlist entry that points at the body, and finally
+// the method / protocol / property lists, each linked into the body. The
+// symbol of the new body is returned through 'catBodySym'. Emission stops at
+// the first step that fails and false is returned.
+bool ObjcCategoryMerger::emitCategory(ClassExtensionInfo &extInfo,
+                                      Defined *&catBodySym) {
+  std::string &mergedName = extInfo.mergedContainerName;
+  ObjFile *ownerFile = extInfo.objFileForMergeData;
+
+  Defined *catNameSym = nullptr;
+  if (!emitCategoryName(mergedName, ownerFile, catNameSym))
+    return false;
+
+  if (!emitCategoryBody(mergedName, catNameSym, extInfo.baseClass,
+                        extInfo.baseClassName, ownerFile, catBodySym))
+    return false;
+
+  Defined *catListSym = nullptr;
+  if (!emitCatListEntrySec(mergedName, extInfo.baseClassName, ownerFile,
+                           catListSym))
+    return false;
+
+  // The catlist section has a single slot, located at offset 0.
+  const uint32_t offsetFirstCat = 0;
+  if (!createSymbolReference(catListSym, catBodySym, offsetFirstCat,
+                             infoCategoryWriter.catListInfo.relocTemplate))
+    return false;
+
+  // '&&' short-circuits, so the lists are emitted in this exact order and
+  // nothing further is emitted after the first failure.
+  return emitAndLinkPointerList(catBodySym, catLayout.instanceMethodsOffset,
+                                extInfo, extInfo.instanceMethods) &&
+         emitAndLinkPointerList(catBodySym, catLayout.classMethodsOffset,
+                                extInfo, extInfo.classMethods) &&
+         emitAndLinkProtocolList(catBodySym, catLayout.protocolsOffset,
+                                 extInfo, extInfo.protocols) &&
+         emitAndLinkPointerList(catBodySym, catLayout.instancePropsOffset,
+                                extInfo, extInfo.instanceProps) &&
+         emitAndLinkPointerList(catBodySym, catLayout.classPropsOffset,
+                                extInfo, extInfo.classProps);
+}
+
+// Folds all of 'categories' (which are expected to share a base class) into
+// one ClassExtensionInfo and emits a single new category from it. Returns
+// false as soon as parsing or emitting fails.
+bool ObjcCategoryMerger::mergeCategoriesIntoSingleCategory(
+    std::vector<InfoInputCategory> &categories) {
+  assert(categories.size() > 1 && "Expected at least 2 categories");
+
+  ClassExtensionInfo mergedInfo;
+  for (InfoInputCategory &inputCat : categories) {
+    if (!parseCatInfoToExtInfo(inputCat, mergedInfo))
+      return false;
+  }
+
+  Defined *mergedCatSym = nullptr;
+  return emitCategory(mergedInfo, mergedCatSym);
+}
+
+// Appends to the section of 'refFrom' a copy of 'relocTemplate' that targets
+// 'refTo' at 'offset' with a zero addend. Always returns true.
+bool ObjcCategoryMerger::createSymbolReference(Defined *refFrom, Symbol *refTo,
+                                               uint32_t offset,
+                                               Reloc &relocTemplate) {
+  Reloc newReloc(relocTemplate);
+  newReloc.referent = refTo;
+  newReloc.addend = 0;
+  newReloc.offset = offset;
+
+  refFrom->isec->relocs.push_back(newReloc);
+  return true;
+}
+
+bool ObjcCategoryMerger::collectAndValidateCategoriesData() {
+  for (InputSection *sec : allInputSections) {
+    if (sec->getName() != section_names::objcCatList)
+      continue;
+    ConcatInputSection *catListCisec = dyn_cast<ConcatInputSection>(sec);
+    if (!catListCisec)
+      return registerError(
+          "__objc_catList InputSection is not a ConcatInputSection");
+
+    for (const Reloc &r : catListCisec->relocs) {
+      auto *sym = cast<Defined>(r.referent.get<Symbol *>());
+      if (!sym || !sym->getName().starts_with("__OBJC_$_CATEGORY_"))
+        continue; // Only support ObjC categories (no swift + @objc)
+
+      auto *catBodyIsec =
+          dyn_cast<ConcatInputSection>(r.getReferentInputSection());
+      if (!catBodyIsec)
+        return registerError(
+            "Category data section is not an ConcatInputSection");
+
+      if (catBodyIsec->getSize() != catLayout.totalSize) {
+        std::string err;
+        llvm::raw_string_ostream OS(err);
+        OS << "Invalid input category size encountered, category merging only "
+              "supports "
+           << catLayout.totalSize << " bytes";
+        OS.flush();
+        return registerError(err.c_str());
+      }
+
+      // Check that the category has a reloc at 'klassOffset' (which is
+      // a pointer to the class symbol)
+
+      auto *classReloc = catBodyIsec->getRelocAt(catLayout.klassOffset);
+      if (!classReloc)
+        return registerError("Category does not have a reloc at klassOffset");
+
+      auto *classSym = classReloc->referent.get<Symbol *>();
+      InfoInputCategory catInputInfo{catBodyIsec, catListCisec, r.offset};
+      categoryMap[classSym].push_back(catInputInfo);
+
+      if (!collectCategoryWriterInfoFromCategory(catInputInfo))
+        return false;
+    }
+  }
+
+  for (auto &entry : categoryMap) {
+    if (entry.second.size() > 1) {
+      // Sort categories by offset to make sure we process categories in
+      // the same order as they appear in the input
+      auto cmpFn = [](const InfoInputCategory &a, const InfoInputCategory &b) {
+        return (a.catListIsec == b.catListIsec) &&
----------------
kyulee-com wrote:

The code still seems to be using `llvm::sort` below. Since the inputs are already ordered by the first value (`catListIsec`) and you want to preserve that order, should this use `stable_sort` instead, like this?
```
llvm::stable_sort(entry.second, [](const InfoInputCategory &a, const InfoInputCategory &b) {
        return a.offCatListIsec < b. offCatListIsec;
});
```

https://github.com/llvm/llvm-project/pull/82928


More information about the llvm-commits mailing list