[lld] 9431787 - LLD Support for Basic Block Sections

Tue Apr 7 06:56:12 PDT 2020

Author: Sriraman Tallam
Date: 2020-04-07T06:55:57-07:00
New Revision: 94317878d8267de23a1a23dfa3d92f9143c433ca

URL: https://github.com/llvm/llvm-project/commit/94317878d8267de23a1a23dfa3d92f9143c433ca
DIFF: https://github.com/llvm/llvm-project/commit/94317878d8267de23a1a23dfa3d92f9143c433ca.diff

LOG: LLD Support for Basic Block Sections

This is part of the Propeller framework to do post link code layout
optimizations. Please see the RFC here:
https://groups.google.com/forum/#!msg/llvm-dev/ef3mKzAdJ7U/1shV64BYBAAJ and the
detailed RFC doc here:
https://github.com/google/llvm-propeller/blob/plo-dev/Propeller_RFC.pdf

This patch adds lld support for basic block sections and performs relaxations
after the basic blocks have been reordered.

After the linker has reordered the basic block sections according to the
desired sequence, it runs a relaxation pass to optimize jump instructions.
Currently, the compiler emits the long form of all jump instructions. AMD64 ISA
supports variants of jump instructions with one byte offset or a four byte
offset. The compiler generates jump instructions with R_X86_64 32-bit PC
relative relocations. We would like to use a new relocation type for these jump
instructions as it makes it easy and accurate while relaxing these instructions.

The relaxation pass does two things:

First, it deletes all explicit fall-through direct jump instructions between
adjacent basic blocks. This is done by discarding the tail of the basic block
section.

Second, If there are consecutive jump instructions, it checks if the first
conditional jump can be inverted to convert the second into a fall through and
delete the second.

The jump instructions are relaxed by using jump instruction mods, something
like relocations. These are used to modify the opcode of the jump instruction.
Jump instruction mods contain three values, instruction offset, jump type and
size. While writing this jump instruction out to the final binary, the linker
uses the jump instruction mod to determine the opcode and the size of the
modified jump instruction. These mods are required because the input object
files are memory-mapped without write permissions and directly modifying the
object files requires copying these sections. Copying a large number of basic
block sections significantly bloats memory.

Differential Revision: https://reviews.llvm.org/D68065

Added: 
    lld/test/ELF/bb-sections-and-icf.s
    lld/test/ELF/bb-sections-delete-fallthru.s
    lld/test/ELF/bb-sections-pc32reloc.s

Modified: 
    lld/ELF/Arch/X86_64.cpp
    lld/ELF/Config.h
    lld/ELF/Driver.cpp
    lld/ELF/InputSection.cpp
    lld/ELF/InputSection.h
    lld/ELF/LTO.cpp
    lld/ELF/Options.td
    lld/ELF/OutputSections.cpp
    lld/ELF/Relocations.h
    lld/ELF/Target.h
    lld/ELF/Writer.cpp

Removed: 
    


################################################################################
diff  --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp
index 7cb6f320fdef..5d9ce6f6a3dc 100644

--- a/lld/ELF/Arch/X86_64.cpp
+++ b/lld/ELF/Arch/X86_64.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "InputFiles.h"
+#include "OutputSections.h"
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
@@ -37,6 +38,8 @@ class X86_64 : public TargetInfo {
                 uint64_t pltEntryAddr) const override;
   void relocate(uint8_t *loc, const Relocation &rel,
                 uint64_t val) const override;
+  void applyJumpInstrMod(uint8_t *loc, JumpModType type,
+                         unsigned size) const override;
 
   RelExpr adjustRelaxExpr(RelType type, const uint8_t *data,
                           RelExpr expr) const override;
@@ -52,9 +55,25 @@ class X86_64 : public TargetInfo {
                       uint64_t val) const override;
   bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
                                         uint8_t stOther) const override;
+  bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
+                             InputSection *nextIS) const override;
 };
 } // namespace
 
+// This is vector of NOP instructions of sizes from 1 to 8 bytes.  The
+// appropriately sized instructions are used to fill the gaps between sections
+// which are executed during fall through.
+static const std::vector<std::vector<uint8_t>> nopInstructions = {
+    {0x90},
+    {0x66, 0x90},
+    {0x0f, 0x1f, 0x00},
+    {0x0f, 0x1f, 0x40, 0x00},
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
+    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
+
 X86_64::X86_64() {
   copyRel = R_X86_64_COPY;
   gotRel = R_X86_64_GLOB_DAT;
@@ -71,6 +90,7 @@ X86_64::X86_64() {
   pltEntrySize = 16;
   ipltEntrySize = 16;
   trapInstr = {0xcc, 0xcc, 0xcc, 0xcc}; // 0xcc = INT3
+  nopInstrs = nopInstructions;
 
   // Align to the large page size (known as a superpage or huge page).
   // FreeBSD automatically promotes large, superpage-aligned allocations.
@@ -79,6 +99,216 @@ X86_64::X86_64() {
 
 int X86_64::getTlsGdRelaxSkip(RelType type) const { return 2; }
 
+// Opcodes for the 
diff erent X86_64 jmp instructions.
+enum JmpInsnOpcode : uint32_t {
+  J_JMP_32,
+  J_JNE_32,
+  J_JE_32,
+  J_JG_32,
+  J_JGE_32,
+  J_JB_32,
+  J_JBE_32,
+  J_JL_32,
+  J_JLE_32,
+  J_JA_32,
+  J_JAE_32,
+  J_UNKNOWN,
+};
+
+// Given the first (optional) and second byte of the insn's opcode, this
+// returns the corresponding enum value.
+static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
+                                    const uint8_t *second) {
+  if (*second == 0xe9)
+    return J_JMP_32;
+
+  if (first == nullptr)
+    return J_UNKNOWN;
+
+  if (*first == 0x0f) {
+    switch (*second) {
+    case 0x84:
+      return J_JE_32;
+    case 0x85:
+      return J_JNE_32;
+    case 0x8f:
+      return J_JG_32;
+    case 0x8d:
+      return J_JGE_32;
+    case 0x82:
+      return J_JB_32;
+    case 0x86:
+      return J_JBE_32;
+    case 0x8c:
+      return J_JL_32;
+    case 0x8e:
+      return J_JLE_32;
+    case 0x87:
+      return J_JA_32;
+    case 0x83:
+      return J_JAE_32;
+    }
+  }
+  return J_UNKNOWN;
+}
+
+// Return the relocation index for input section IS with a specific Offset.
+// Returns the maximum size of the vector if no such relocation is found.
+static unsigned getRelocationWithOffset(const InputSection &is,
+                                        uint64_t offset) {
+  unsigned size = is.relocations.size();
+  for (unsigned i = size - 1; i + 1 > 0; --i) {
+    if (is.relocations[i].offset == offset && is.relocations[i].expr != R_NONE)
+      return i;
+  }
+  return size;
+}
+
+// Returns true if R corresponds to a relocation used for a jump instruction.
+// TODO: Once special relocations for relaxable jump instructions are available,
+// this should be modified to use those relocations.
+static bool isRelocationForJmpInsn(Relocation &R) {
+  return R.type == R_X86_64_PLT32 || R.type == R_X86_64_PC32 ||
+         R.type == R_X86_64_PC8;
+}
+
+// Return true if Relocation R points to the first instruction in the
+// next section.
+// TODO: Delete this once psABI reserves a new relocation type for fall thru
+// jumps.
+static bool isFallThruRelocation(InputSection &is, InputFile *file,
+                                 InputSection *nextIS, Relocation &r) {
+  if (!isRelocationForJmpInsn(r))
+    return false;
+
+  uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
+  uint64_t targetOffset = InputSectionBase::getRelocTargetVA(
+      file, r.type, r.addend, addrLoc, *r.sym, r.expr);
+
+  // If this jmp is a fall thru, the target offset is the beginning of the
+  // next section.
+  uint64_t nextSectionOffset =
+      nextIS->getOutputSection()->addr + nextIS->outSecOff;
+  return (addrLoc + 4 + targetOffset) == nextSectionOffset;
+}
+
+// Return the jmp instruction opcode that is the inverse of the given
+// opcode.  For example, JE inverted is JNE.
+static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
+  switch (opcode) {
+  case J_JE_32:
+    return J_JNE_32;
+  case J_JNE_32:
+    return J_JE_32;
+  case J_JG_32:
+    return J_JLE_32;
+  case J_JGE_32:
+    return J_JL_32;
+  case J_JB_32:
+    return J_JAE_32;
+  case J_JBE_32:
+    return J_JA_32;
+  case J_JL_32:
+    return J_JGE_32;
+  case J_JLE_32:
+    return J_JG_32;
+  case J_JA_32:
+    return J_JBE_32;
+  case J_JAE_32:
+    return J_JB_32;
+  default:
+    return J_UNKNOWN;
+  }
+}
+
+// Deletes direct jump instruction in input sections that jumps to the
+// following section as it is not required.  If there are two consecutive jump
+// instructions, it checks if they can be flipped and one can be deleted.
+// For example:
+// .section .text
+// a.BB.foo:
+//    ...
+//    10: jne aa.BB.foo
+//    16: jmp bar
+// aa.BB.foo:
+//    ...
+//
+// can be converted to:
+// a.BB.foo:
+//   ...
+//   10: je bar  #jne flipped to je and the jmp is deleted.
+// aa.BB.foo:
+//   ...
+bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
+                                   InputSection *nextIS) const {
+  const unsigned sizeOfDirectJmpInsn = 5;
+
+  if (nextIS == nullptr)
+    return false;
+
+  if (is.getSize() < sizeOfDirectJmpInsn)
+    return false;
+
+  // If this jmp insn can be removed, it is the last insn and the
+  // relocation is 4 bytes before the end.
+  unsigned rIndex = getRelocationWithOffset(is, is.getSize() - 4);
+  if (rIndex == is.relocations.size())
+    return false;
+
+  Relocation &r = is.relocations[rIndex];
+
+  // Check if the relocation corresponds to a direct jmp.
+  const uint8_t *secContents = is.data().data();
+  // If it is not a direct jmp instruction, there is nothing to do here.
+  if (*(secContents + r.offset - 1) != 0xe9)
+    return false;
+
+  if (isFallThruRelocation(is, file, nextIS, r)) {
+    // This is a fall thru and can be deleted.
+    r.expr = R_NONE;
+    r.offset = 0;
+    is.drop_back(sizeOfDirectJmpInsn);
+    is.nopFiller = true;
+    return true;
+  }
+
+  // Now, check if flip and delete is possible.
+  const unsigned sizeOfJmpCCInsn = 6;
+  // To flip, there must be atleast one JmpCC and one direct jmp.
+  if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
+    return 0;
+
+  unsigned rbIndex =
+      getRelocationWithOffset(is, (is.getSize() - sizeOfDirectJmpInsn - 4));
+  if (rbIndex == is.relocations.size())
+    return 0;
+
+  Relocation &rB = is.relocations[rbIndex];
+
+  const uint8_t *jmpInsnB = secContents + rB.offset - 1;
+  JmpInsnOpcode jmpOpcodeB = getJmpInsnType(jmpInsnB - 1, jmpInsnB);
+  if (jmpOpcodeB == J_UNKNOWN)
+    return false;
+
+  if (!isFallThruRelocation(is, file, nextIS, rB))
+    return false;
+
+  // jmpCC jumps to the fall thru block, the branch can be flipped and the
+  // jmp can be deleted.
+  JmpInsnOpcode jInvert = invertJmpOpcode(jmpOpcodeB);
+  if (jInvert == J_UNKNOWN)
+    return false;
+  is.jumpInstrMods.push_back({jInvert, (rB.offset - 1), 4});
+  // Move R's values to rB except the offset.
+  rB = {r.expr, r.type, rB.offset, r.addend, r.sym};
+  // Cancel R
+  r.expr = R_NONE;
+  r.offset = 0;
+  is.drop_back(sizeOfDirectJmpInsn);
+  is.nopFiller = true;
+  return true;
+}
+
 RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
                            const uint8_t *loc) const {
   if (type == R_X86_64_GOTTPOFF)
@@ -357,6 +587,94 @@ void X86_64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel,
         "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD");
 }
 
+// A JumpInstrMod at a specific offset indicates that the jump instruction
+// opcode at that offset must be modified.  This is specifically used to relax
+// jump instructions with basic block sections.  This function looks at the
+// JumpMod and effects the change.
+void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
+                               unsigned size) const {
+  switch (type) {
+  case J_JMP_32:
+    if (size == 4)
+      *loc = 0xe9;
+    else
+      *loc = 0xeb;
+    break;
+  case J_JE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x84;
+    } else
+      *loc = 0x74;
+    break;
+  case J_JNE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x85;
+    } else
+      *loc = 0x75;
+    break;
+  case J_JG_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x8f;
+    } else
+      *loc = 0x7f;
+    break;
+  case J_JGE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x8d;
+    } else
+      *loc = 0x7d;
+    break;
+  case J_JB_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x82;
+    } else
+      *loc = 0x72;
+    break;
+  case J_JBE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x86;
+    } else
+      *loc = 0x76;
+    break;
+  case J_JL_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x8c;
+    } else
+      *loc = 0x7c;
+    break;
+  case J_JLE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x8e;
+    } else
+      *loc = 0x7e;
+    break;
+  case J_JA_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x87;
+    } else
+      *loc = 0x77;
+    break;
+  case J_JAE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x83;
+    } else
+      *loc = 0x73;
+    break;
+  case J_UNKNOWN:
+    llvm_unreachable("Unknown Jump Relocation");
+  }
+}
+
 void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
   switch (rel.type) {
   case R_X86_64_8:

diff  --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 0263708634cb..70ef86239aa2 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -114,6 +114,7 @@ struct Configuration {
   llvm::StringRef sysroot;
   llvm::StringRef thinLTOCacheDir;
   llvm::StringRef thinLTOIndexOnlyArg;
+  llvm::StringRef ltoBasicBlockSections;
   std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
   std::pair<llvm::StringRef, llvm::StringRef> thinLTOPrefixReplace;
   std::string rpath;
@@ -165,6 +166,7 @@ struct Configuration {
   bool ltoCSProfileGenerate;
   bool ltoDebugPassManager;
   bool ltoNewPassManager;
+  bool ltoUniqueBBSectionNames;
   bool ltoWholeProgramVisibility;
   bool mergeArmExidx;
   bool mipsN32Abi = false;
@@ -175,6 +177,7 @@ struct Configuration {
   bool nostdlib;
   bool oFormatBinary;
   bool omagic;
+  bool optimizeBBJumps;
   bool optRemarksWithHotness;
   bool picThunk;
   bool pie;

diff  --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index c31cbcc8a54d..914cfefbd329 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -878,6 +878,8 @@ static void readConfigs(opt::InputArgList &args) {
   config->cref = args.hasFlag(OPT_cref, OPT_no_cref, false);
   config->defineCommon = args.hasFlag(OPT_define_common, OPT_no_define_common,
                                       !args.hasArg(OPT_relocatable));
+  config->optimizeBBJumps =
+      args.hasFlag(OPT_optimize_bb_jumps, OPT_no_optimize_bb_jumps, false);
   config->demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true);
   config->dependentLibraries = args.hasFlag(OPT_dependent_libraries, OPT_no_dependent_libraries, true);
   config->disableVerify = args.hasArg(OPT_disable_verify);
@@ -924,6 +926,11 @@ static void readConfigs(opt::InputArgList &args) {
   config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq);
   config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1);
   config->ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile);
+  config->ltoBasicBlockSections =
+      args.getLastArgValue(OPT_lto_basicblock_sections);
+  config->ltoUniqueBBSectionNames =
+      args.hasFlag(OPT_lto_unique_bb_section_names,
+                   OPT_no_lto_unique_bb_section_names, false);
   config->mapFile = args.getLastArgValue(OPT_Map);
   config->mipsGotSize = args::getInteger(args, OPT_mips_got_size, 0xfff0);
   config->mergeArmExidx =

diff  --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index e93dec947d90..13c3dd486c33 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -138,7 +138,7 @@ size_t InputSectionBase::getSize() const {
     return s->getSize();
   if (uncompressedSize >= 0)
     return uncompressedSize;
-  return rawData.size();
+  return rawData.size() - bytesDropped;
 }
 
 void InputSectionBase::uncompress() const {
@@ -659,8 +659,9 @@ static int64_t getTlsTpOffset(const Symbol &s) {
   }
 }
 
-static uint64_t getRelocTargetVA(const InputFile *file, RelType type, int64_t a,
-                                 uint64_t p, const Symbol &sym, RelExpr expr) {
+uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
+                                            int64_t a, uint64_t p,
+                                            const Symbol &sym, RelExpr expr) {
   switch (expr) {
   case R_ABS:
   case R_DTPREL:
@@ -871,6 +872,12 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
     if (expr == R_NONE)
       continue;
 
+    if (expr == R_SIZE) {
+      target->relocateNoSym(bufLoc, type,
+                            SignExtend64<bits>(sym.getSize() + addend));
+      continue;
+    }
+
     if (expr != R_ABS && expr != R_DTPREL && expr != R_RISCV_ADD) {
       std::string msg = getLocation<ELFT>(offset) +
                         ": has non-ABS relocation " + toString(type) +
@@ -942,6 +949,8 @@ void InputSectionBase::relocateAlloc(uint8_t *buf, uint8_t *bufEnd) {
   const unsigned bits = config->wordsize * 8;
 
   for (const Relocation &rel : relocations) {
+    if (rel.expr == R_NONE)
+      continue;
     uint64_t offset = rel.offset;
     if (auto *sec = dyn_cast<InputSection>(this))
       offset += sec->outSecOff;
@@ -1011,6 +1020,18 @@ void InputSectionBase::relocateAlloc(uint8_t *buf, uint8_t *bufEnd) {
       break;
     }
   }
+
+  // Apply jumpInstrMods.  jumpInstrMods are created when the opcode of
+  // a jmp insn must be modified to shrink the jmp insn or to flip the jmp
+  // insn.  This is primarily used to relax and optimize jumps created with
+  // basic block sections.
+  if (auto *sec = dyn_cast<InputSection>(this)) {
+    for (const JumpInstrMod &jumpMod : jumpInstrMods) {
+      uint64_t offset = jumpMod.offset + sec->outSecOff;
+      uint8_t *bufLoc = buf + offset;
+      target->applyJumpInstrMod(bufLoc, jumpMod.original, jumpMod.size);
+    }
+  }
 }
 
 // For each function-defining prologue, find any calls to __morestack,

diff  --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h
index fe2c3c516a96..719971b9c72a 100644
--- a/lld/ELF/InputSection.h
+++ b/lld/ELF/InputSection.h
@@ -128,6 +128,26 @@ class InputSectionBase : public SectionBase {
     return cast_or_null<ObjFile<ELFT>>(file);
   }
 
+  // If basic block sections are enabled, many code sections could end up with
+  // one or two jump instructions at the end that could be relaxed to a smaller
+  // instruction. The members below help trimming the trailing jump instruction
+  // and shrinking a section.
+  unsigned bytesDropped = 0;
+
+  void drop_back(uint64_t num) { bytesDropped += num; }
+
+  void push_back(uint64_t num) {
+    assert(bytesDropped >= num);
+    bytesDropped -= num;
+  }
+
+  void trim() {
+    if (bytesDropped) {
+      rawData = rawData.drop_back(bytesDropped);
+      bytesDropped = 0;
+    }
+  }
+
   ArrayRef<uint8_t> data() const {
     if (uncompressedSize >= 0)
       uncompress();
@@ -183,12 +203,25 @@ class InputSectionBase : public SectionBase {
   // the mmap'ed output buffer.
   template <class ELFT> void relocate(uint8_t *buf, uint8_t *bufEnd);
   void relocateAlloc(uint8_t *buf, uint8_t *bufEnd);
+  static uint64_t getRelocTargetVA(const InputFile *File, RelType Type,
+                                   int64_t A, uint64_t P, const Symbol &Sym,
+                                   RelExpr Expr);
 
   // The native ELF reloc data type is not very convenient to handle.
   // So we convert ELF reloc records to our own records in Relocations.cpp.
   // This vector contains such "cooked" relocations.
   std::vector<Relocation> relocations;
 
+  // Indicates that this section needs to be padded with a NOP filler if set to
+  // true.
+  bool nopFiller = false;
+
+  // These are modifiers to jump instructions that are necessary when basic
+  // block sections are enabled.  Basic block sections creates opportunities to
+  // relax jump instructions at basic block boundaries after reordering the
+  // basic blocks.
+  std::vector<JumpInstrMod> jumpInstrMods;
+
   // A function compiled with -fsplit-stack calling a function
   // compiled without -fsplit-stack needs its prologue adjusted. Find
   // such functions and adjust their prologues.  This is very similar

diff  --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index f4e04a82f0c2..d09f2a74e48c 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -76,6 +76,32 @@ static lto::Config createConfig() {
   c.Options.FunctionSections = true;
   c.Options.DataSections = true;
 
+  // Check if basic block sections must be used.
+  // Allowed values for --lto-basicblock-sections are "all", "labels",
+  // "<file name specifying basic block ids>", or none.  This is the equivalent
+  // of -fbasicblock-sections= flag in clang.
+  if (!config->ltoBasicBlockSections.empty()) {
+    if (config->ltoBasicBlockSections == "all") {
+      c.Options.BBSections = BasicBlockSection::All;
+    } else if (config->ltoBasicBlockSections == "labels") {
+      c.Options.BBSections = BasicBlockSection::Labels;
+    } else if (config->ltoBasicBlockSections == "none") {
+      c.Options.BBSections = BasicBlockSection::None;
+    } else {
+      ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+          MemoryBuffer::getFile(config->ltoBasicBlockSections.str());
+      if (!MBOrErr) {
+        error("cannot open " + config->ltoBasicBlockSections + ":" +
+              MBOrErr.getError().message());
+      } else {
+        c.Options.BBSectionsFuncListBuf = std::move(*MBOrErr);
+      }
+      c.Options.BBSections = BasicBlockSection::List;
+    }
+  }
+
+  c.Options.UniqueBBSectionNames = config->ltoUniqueBBSectionNames;
+
   if (auto relocModel = getRelocModelFromCMModel())
     c.RelocModel = *relocModel;
   else if (config->relocatable)

diff  --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index f3afb03cba47..83b9fa467033 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -42,6 +42,10 @@ defm compress_debug_sections:
 
 defm defsym: Eq<"defsym", "Define a symbol alias">, MetaVarName<"<symbol>=<value>">;
 
+defm optimize_bb_jumps: B<"optimize-bb-jumps",
+    "Remove direct jumps at the end to the next basic block",
+    "Do not remove any direct jumps at the end to the next basic block (default)">;
+
 defm split_stack_adjust_size
     : Eq<"split-stack-adjust-size",
          "Specify adjustment to stack size when a split-stack function calls a "
@@ -502,6 +506,11 @@ def opt_remarks_format: Separate<["--"], "opt-remarks-format">,
   HelpText<"The format used for serializing remarks (default: YAML)">;
 defm plugin_opt: Eq<"plugin-opt", "specifies LTO options for compatibility with GNU linkers">;
 def save_temps: F<"save-temps">;
+def lto_basicblock_sections: J<"lto-basicblock-sections=">,
+  HelpText<"Enable basic block sections for LTO">;
+defm lto_unique_bb_section_names: B<"lto-unique-bb-section-names",
+    "Give unique names to every basic block section for LTO",
+    "Do not give unique names to every basic block section for LTO (default)">;
 def shuffle_sections: J<"shuffle-sections=">, MetaVarName<"<seed>">,
   HelpText<"Shuffle input sections using the given seed. If 0, use a random seed">;
 def thinlto_cache_dir: J<"thinlto-cache-dir=">,

diff  --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index c38d4f0b6750..59868e38d72f 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -242,6 +242,25 @@ void OutputSection::sort(llvm::function_ref<int(InputSectionBase *s)> order) {
       sortByOrder(isd->sections, order);
 }
 
+static void nopInstrFill(uint8_t *buf, size_t size) {
+  if (size == 0)
+    return;
+  unsigned i = 0;
+  if (size == 0)
+    return;
+  std::vector<std::vector<uint8_t>> nopFiller = *target->nopInstrs;
+  unsigned num = size / nopFiller.back().size();
+  for (unsigned c = 0; c < num; ++c) {
+    memcpy(buf + i, nopFiller.back().data(), nopFiller.back().size());
+    i += nopFiller.back().size();
+  }
+  unsigned remaining = size - i;
+  if (!remaining)
+    return;
+  assert(nopFiller[remaining - 1].size() == remaining);
+  memcpy(buf + i, nopFiller[remaining - 1].data(), remaining);
+}
+
 // Fill [Buf, Buf + Size) with Filler.
 // This is used for linker script "=fillexp" command.
 static void fill(uint8_t *buf, size_t size,
@@ -330,7 +349,11 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
         end = buf + size;
       else
         end = buf + sections[i + 1]->outSecOff;
-      fill(start, end - start, filler);
+      if (isec->nopFiller) {
+        assert(target->nopInstrs);
+        nopInstrFill(start, end - start);
+      } else
+        fill(start, end - start, filler);
     }
   });
 

diff  --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index b3181b67ad0e..ec59c63410d0 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -24,6 +24,7 @@ class SectionBase;
 
 // Represents a relocation type, such as R_X86_64_PC32 or R_ARM_THM_CALL.
 using RelType = uint32_t;
+using JumpModType = uint32_t;
 
 // List of target-independent relocation types. Relocations read
 // from files are converted to these types so that the main code
@@ -108,6 +109,15 @@ struct Relocation {
   Symbol *sym;
 };
 
+// Manipulate jump instructions with these modifiers.  These are used to relax
+// jump instruction opcodes at basic block boundaries and are particularly
+// useful when basic block sections are enabled.
+struct JumpInstrMod {
+  JumpModType original;
+  uint64_t offset;
+  unsigned size;
+};
+
 // This function writes undefined symbol diagnostics to an internal buffer.
 // Call reportUndefinedSymbols() after calling scanRelocations() to emit
 // the diagnostics.

diff  --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index f58f216332a5..a308a41ff4b9 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -88,8 +88,21 @@ class TargetInfo {
     relocate(loc, Relocation{R_NONE, type, 0, 0, nullptr}, val);
   }
 
+  virtual void applyJumpInstrMod(uint8_t *loc, JumpModType type,
+                                 JumpModType val) const {}
+
   virtual ~TargetInfo();
 
+  // This deletes a jump insn at the end of the section if it is a fall thru to
+  // the next section.  Further, if there is a conditional jump and a direct
+  // jump consecutively, it tries to flip the conditional jump to convert the
+  // direct jump into a fall thru and delete it.  Returns true if a jump
+  // instruction can be deleted.
+  virtual bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
+                                     InputSection *nextIS) const {
+    return false;
+  }
+
   unsigned defaultCommonPageSize = 4096;
   unsigned defaultMaxPageSize = 4096;
 
@@ -126,6 +139,10 @@ class TargetInfo {
   // executable OutputSections.
   std::array<uint8_t, 4> trapInstr;
 
+  // Stores the NOP instructions of 
diff erent sizes for the target and is used
+  // to pad sections that are relaxed.
+  llvm::Optional<std::vector<std::vector<uint8_t>>> nopInstrs;
+
   // If a target needs to rewrite calls to __morestack to instead call
   // __morestack_non_split when a split-stack enabled caller calls a
   // non-split-stack callee this will return true. Otherwise returns false.

diff  --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 801ca7a0dc5e..55df6b7c5e4c 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -31,6 +31,8 @@
 #include "llvm/Support/xxhash.h"
 #include <climits>
 
+#define DEBUG_TYPE "lld"
+
 using namespace llvm;
 using namespace llvm::ELF;
 using namespace llvm::object;
@@ -57,6 +59,7 @@ template <class ELFT> class Writer {
   void sortSections();
   void resolveShfLinkOrder();
   void finalizeAddressDependentContent();
+  void optimizeBasicBlockJumps();
   void sortInputSections();
   void finalizeSections();
   void checkExecuteOnly();
@@ -1670,6 +1673,94 @@ template <class ELFT> void Writer<ELFT>::finalizeAddressDependentContent() {
              Twine(os->alignment) + ")");
 }
 
+// If Input Sections have been shrinked (basic block sections) then
+// update symbol values and sizes associated with these sections.  With basic
+// block sections, input sections can shrink when the jump instructions at
+// the end of the section are relaxed.
+static void fixSymbolsAfterShrinking() {
+  for (InputFile *File : objectFiles) {
+    parallelForEach(File->getSymbols(), [&](Symbol *Sym) {
+      auto *def = dyn_cast<Defined>(Sym);
+      if (!def)
+        return;
+
+      const SectionBase *sec = def->section;
+      if (!sec)
+        return;
+
+      const InputSectionBase *inputSec = dyn_cast<InputSectionBase>(sec->repl);
+      if (!inputSec || !inputSec->bytesDropped)
+        return;
+
+      const size_t OldSize = inputSec->data().size();
+      const size_t NewSize = OldSize - inputSec->bytesDropped;
+
+      if (def->value > NewSize && def->value <= OldSize) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Moving symbol " << Sym->getName() << " from "
+                   << def->value << " to "
+                   << def->value - inputSec->bytesDropped << " bytes\n");
+        def->value -= inputSec->bytesDropped;
+        return;
+      }
+
+      if (def->value + def->size > NewSize && def->value <= OldSize &&
+          def->value + def->size <= OldSize) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Shrinking symbol " << Sym->getName() << " from "
+                   << def->size << " to " << def->size - inputSec->bytesDropped
+                   << " bytes\n");
+        def->size -= inputSec->bytesDropped;
+      }
+    });
+  }
+}
+
+// If basic block sections exist, there are opportunities to delete fall thru
+// jumps and shrink jump instructions after basic block reordering.  This
+// relaxation pass does that.  It is only enabled when --optimize-bb-jumps
+// option is used.
+template <class ELFT> void Writer<ELFT>::optimizeBasicBlockJumps() {
+  assert(config->optimizeBBJumps);
+
+  script->assignAddresses();
+  // For every output section that has executable input sections, this
+  // does the following:
+  //   1. Deletes all direct jump instructions in input sections that
+  //      jump to the following section as it is not required.
+  //   2. If there are two consecutive jump instructions, it checks
+  //      if they can be flipped and one can be deleted.
+  for (OutputSection *os : outputSections) {
+    if (!(os->flags & SHF_EXECINSTR))
+      continue;
+    std::vector<InputSection *> sections = getInputSections(os);
+    std::vector<unsigned> result(sections.size());
+    // Delete all fall through jump instructions.  Also, check if two
+    // consecutive jump instructions can be flipped so that a fall
+    // through jmp instruction can be deleted.
+    parallelForEachN(0, sections.size(), [&](size_t i) {
+      InputSection *next = i + 1 < sections.size() ? sections[i + 1] : nullptr;
+      InputSection &is = *sections[i];
+      result[i] =
+          target->deleteFallThruJmpInsn(is, is.getFile<ELFT>(), next) ? 1 : 0;
+    });
+    size_t numDeleted = std::count(result.begin(), result.end(), 1);
+    if (numDeleted > 0) {
+      script->assignAddresses();
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Removing " << numDeleted << " fall through jumps\n");
+    }
+  }
+
+  fixSymbolsAfterShrinking();
+
+  for (OutputSection *os : outputSections) {
+    std::vector<InputSection *> sections = getInputSections(os);
+    for (InputSection *is : sections)
+      is->trim();
+  }
+}
+
 static void finalizeSynthetic(SyntheticSection *sec) {
   if (sec && sec->isNeeded() && sec->getParent())
     sec->finalizeContents();
@@ -1992,6 +2083,12 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
   finalizeSynthetic(in.symTab);
   finalizeSynthetic(in.ppc64LongBranchTarget);
 
+  // Relaxation to delete inter-basic block jumps created by basic block
+  // sections. Run after in.symTab is finalized as optimizeBasicBlockJumps
+  // can relax jump instructions based on symbol offset.
+  if (config->optimizeBBJumps)
+    optimizeBasicBlockJumps();
+
   // Fill other section headers. The dynamic table is finalized
   // at the end because some tags like RELSZ depend on result
   // of finalizing other sections.

diff  --git a/lld/test/ELF/bb-sections-and-icf.s b/lld/test/ELF/bb-sections-and-icf.s
new file mode 100644
index 000000000000..bcc9193c6ed8
--- /dev/null
+++ b/lld/test/ELF/bb-sections-and-icf.s
@@ -0,0 +1,47 @@
+# REQUIRES: x86
+## basicblock-sections tests.
+## This simple test checks foo is folded into bar with bb sections
+## and the jumps are deleted.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld --optimize-bb-jumps --icf=all %t.o -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s
+
+# CHECK:      <foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  je 0x{{[[:xdigit:]]+}} <aa.BB.foo>
+# CHECK-NOT:   jmp
+
+# CHECK:     <a.BB.foo>:
+## Explicity check that bar is folded and not emitted.
+# CHECK-NOT: <bar>:
+# CHECK-NOT: <a.BB.bar>:
+# CHECK-NOT: <aa.BB.bar>:
+
+.section	.text.bar,"ax", at progbits
+.type	bar, at function
+bar:
+ nopl (%rax)
+ jne	a.BB.bar
+ jmp	aa.BB.bar
+
+.section	.text.a.BB.bar,"ax", at progbits,unique,3
+a.BB.bar:
+ nopl (%rax)
+
+aa.BB.bar:
+ ret
+
+.section	.text.foo,"ax", at progbits
+.type	foo, at function
+foo:
+ nopl (%rax)
+ jne	a.BB.foo
+ jmp	aa.BB.foo
+
+.section	.text.a.BB.foo,"ax", at progbits,unique,2
+a.BB.foo:
+ nopl (%rax)
+
+aa.BB.foo:
+ ret

diff  --git a/lld/test/ELF/bb-sections-delete-fallthru.s b/lld/test/ELF/bb-sections-delete-fallthru.s
new file mode 100644
index 000000000000..c8a0e9353424
--- /dev/null
+++ b/lld/test/ELF/bb-sections-delete-fallthru.s
@@ -0,0 +1,128 @@
+# REQUIRES: x86
+## basicblock-sections tests.
+## This simple test checks if redundant direct jumps are converted to
+## implicit fallthrus.  The jcc's must be converted to their inverted
+## opcode, for instance jne to je and jmp must be deleted.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld  --optimize-bb-jumps %t.o -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s
+
+# CHECK:      <foo>:
+# CHECK-NEXT:  nopl    (%rax)
+# CHECK-NEXT:  jne      0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+
+
+.section	.text,"ax", at progbits
+.type	foo, at function
+foo:
+ nopl (%rax)
+ je	a.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <a.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  je 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+
+.section	.text,"ax", at progbits,unique,3
+a.BB.foo:
+ nopl (%rax)
+ jne	aa.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <aa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jle 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section	.text,"ax", at progbits,unique,4
+aa.BB.foo:
+ nopl (%rax)
+ jg	aaa.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <aaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jl 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section	.text,"ax", at progbits,unique,5
+aaa.BB.foo:
+ nopl (%rax)
+ jge	aaaa.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <aaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jae 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section	.text,"ax", at progbits,unique,6
+aaaa.BB.foo:
+ nopl (%rax)
+ jb	aaaaa.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <aaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  ja 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section	.text,"ax", at progbits,unique,7
+aaaaa.BB.foo:
+ nopl (%rax)
+ jbe	aaaaaa.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <aaaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jge 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section	.text,"ax", at progbits,unique,8
+aaaaaa.BB.foo:
+ nopl (%rax)
+ jl	aaaaaaa.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <aaaaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jg 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section	.text,"ax", at progbits,unique,9
+aaaaaaa.BB.foo:
+ nopl (%rax)
+ jle	aaaaaaaa.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <aaaaaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jbe 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section	.text,"ax", at progbits,unique,10
+aaaaaaaa.BB.foo:
+ nopl (%rax)
+ ja	aaaaaaaaa.BB.foo
+ jmp	r.BB.foo
+
+# CHECK:      <aaaaaaaaa.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jb 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+#
+.section	.text,"ax", at progbits,unique,11
+aaaaaaaaa.BB.foo:
+ nopl (%rax)
+ jae	aaaaaaaaaa.BB.foo
+ jmp	r.BB.foo
+
+.section	.text,"ax", at progbits,unique,20
+aaaaaaaaaa.BB.foo:
+ nopl (%rax)
+
+r.BB.foo:
+ ret

diff  --git a/lld/test/ELF/bb-sections-pc32reloc.s b/lld/test/ELF/bb-sections-pc32reloc.s
new file mode 100644
index 000000000000..9631a3cfe6c3
--- /dev/null
+++ b/lld/test/ELF/bb-sections-pc32reloc.s
@@ -0,0 +1,37 @@
+# REQUIRES: x86
+## basicblock-sections tests.
+## This simple test checks if redundant direct jumps are converted to
+## implicit fallthrus when PC32 reloc is present.  The jcc's must be converted
+## to their inverted opcode, for instance jne to je and jmp must be deleted.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: llvm-objdump -dr %t.o| FileCheck %s --check-prefix=RELOC
+# RUN: ld.lld  --optimize-bb-jumps %t.o -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s
+
+# RELOC:      jmp
+# RELOC-NEXT: R_X86_64_PC32
+
+# CHECK:      <foo>:
+# CHECK-NEXT:  nopl (%rax)
+# CHECK-NEXT:  jne 0x{{[[:xdigit:]]+}} <r.BB.foo>
+# CHECK-NOT:   jmp
+
+
+.section	.text,"ax", at progbits
+.type	foo, at function
+foo:
+ nopl (%rax)
+ je	a.BB.foo
+# Encode a jmp r.BB.foo insn using a PC32 reloc
+ .byte  0xe9
+ .long  r.BB.foo - . - 4
+
+# CHECK:      <a.BB.foo>:
+# CHECK-NEXT:  nopl (%rax)
+
+.section	.text,"ax", at progbits,unique,3
+a.BB.foo:
+ nopl (%rax)
+r.BB.foo:
+ ret