<div dir="ltr">What is your machine spec, by the way?</div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Dec 5, 2016 at 9:22 AM, Rafael Avila de Espindola <span dir="ltr">&lt;<a href="mailto:rafael.espindola@gmail.com" target="_blank">rafael.espindola@gmail.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><br>

Thanks!<br>

<br>

I didn't have access to my workstation last week. Now that I do, I<br>

measure 1.15226905502x faster for firefox and 1.27814295845x faster for<br>

scylla, the two programs with debug info in the tests I normally run.<br>

<br>

Cheers,<br>

Rafael<br>

<div class="HOEnZb"><div class="h5"><br>

Rui Ueyama via llvm-commits &lt;<a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a>&gt; writes:<br>

<br>

> Author: ruiu<br>

> Date: Fri Nov 25 14:05:08 2016<br>

> New Revision: 287946<br>

><br>

> URL: <a href="http://llvm.org/viewvc/llvm-project?rev=287946&view=rev" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project?rev=287946&view=rev</a><br>

> Log:<br>

> Parallelize uncompress() and splitIntoPieces().<br>

><br>

> Uncompressing section contents and splitting mergeable section contents<br>

> into smaller chunks are heavy tasks. They scan entire section contents<br>

> and do CPU-intensive tasks such as uncompressing zlib-compressed data<br>

> or computing a hash value for each section piece.<br>

><br>

> Luckily, these tasks are independent of each other, so we can do that<br>

> in parallel_for_each. The number of input sections is large (as opposed<br>

> to the number of output sections), so there's a lot of parallelism here.<br>

><br>

> Actually the current design to call uncompress() and splitIntoPieces()<br>

> in batch was chosen with doing this in mind. Basically what we need to<br>

> do here is to replace `for` with `parallel_for_each`.<br>

><br>

> It seems this patch improves latency significantly if linked programs<br>

> contain debug info (which in turn contain lots of mergeable strings.)<br>

> For example, the latency to link Clang (debug build) improved by 20% on<br>

> my machine as shown below. Note that ld.gold took 19.2 seconds to do<br>

> the same thing.<br>

><br>

> Before:<br>

>     30801.782712 task-clock (msec)         #    3.652 CPUs utilized            ( +-  2.59% )<br>

>          104,084 context-switches          #    0.003 M/sec                    ( +-  1.02% )<br>

>            5,063 cpu-migrations            #    0.164 K/sec                    ( +- 13.66% )<br>

>        2,528,130 page-faults               #    0.082 M/sec                    ( +-  0.47% )<br>

>   85,317,809,130 cycles                    #    2.770 GHz                      ( +-  2.62% )<br>

>   67,352,463,373 stalled-cycles-frontend   #   78.94% frontend cycles idle     ( +-  3.06% )<br>

>  &lt;not supported&gt; stalled-cycles-backend<br>

>   44,295,945,493 instructions              #    0.52  insns per cycle<br>

>                                            #    1.52  stalled cycles per insn  ( +-  0.44% )<br>

>    8,572,384,877 branches                  #  278.308 M/sec                    ( +-  0.66% )<br>

>      141,806,726 branch-misses             #    1.65% of all branches          ( +-  0.13% )<br>

><br>

>      8.433424003 seconds time elapsed                                          ( +-  1.20% )<br>

><br>

> After:<br>

>     35523.764575 task-clock (msec)         #    5.265 CPUs utilized            ( +-  2.67% )<br>

>          159,107 context-switches          #    0.004 M/sec                    ( +-  0.48% )<br>

>            8,123 cpu-migrations            #    0.229 K/sec                    ( +- 23.34% )<br>

>        2,372,483 page-faults               #    0.067 M/sec                    ( +-  0.36% )<br>

>   98,395,342,152 cycles                    #    2.770 GHz                      ( +-  2.62% )<br>

>   79,294,670,125 stalled-cycles-frontend   #   80.59% frontend cycles idle     ( +-  3.03% )<br>

>  &lt;not supported&gt; stalled-cycles-backend<br>

>   46,274,151,813 instructions              #    0.47  insns per cycle<br>

>                                            #    1.71  stalled cycles per insn  ( +-  0.47% )<br>

>    8,987,621,670 branches                  #  253.003 M/sec                    ( +-  0.60% )<br>

>      148,900,624 branch-misses             #    1.66% of all branches          ( +-  0.27% )<br>

><br>

>      6.747548004 seconds time elapsed                                          ( +-  0.40% )<br>

><br>

> Modified:<br>

>     lld/trunk/ELF/Driver.cpp<br>

>     lld/trunk/ELF/InputSection.cpp<br>

><br>

> Modified: lld/trunk/ELF/Driver.cpp<br>

> URL: <a href="http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/Driver.cpp?rev=287946&r1=287945&r2=287946&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/lld/trunk/ELF/Driver.<wbr>cpp?rev=287946&r1=287945&r2=<wbr>287946&view=diff</a><br>

> ==============================<wbr>==============================<wbr>==================<br>

> --- lld/trunk/ELF/Driver.cpp (original)<br>

> +++ lld/trunk/ELF/Driver.cpp Fri Nov 25 14:05:08 2016<br>

> @@ -20,6 +20,7 @@<br>

>  #include "Target.h"<br>

>  #include "Writer.h"<br>

>  #include "lld/Config/Version.h"<br>

> +#include "lld/Core/Parallel.h"<br>

>  #include "lld/Driver/Driver.h"<br>

>  #include "llvm/ADT/StringExtras.h"<br>

>  #include "llvm/ADT/StringSwitch.h"<br>

> @@ -800,14 +801,15 @@ template <class ELFT> void LinkerDriver:<br>

><br>

>    // MergeInputSection::<wbr>splitIntoPieces needs to be called before<br>

>    // any call of MergeInputSection::getOffset. Do that.<br>

> -  for (InputSectionBase<ELFT> *S : Symtab.Sections) {<br>

> -    if (!S->Live)<br>

> -      continue;<br>

> -    if (S->Compressed)<br>

> -      S->uncompress();<br>

> -    if (auto *MS = dyn_cast<MergeInputSection<<wbr>ELFT>>(S))<br>

> -      MS->splitIntoPieces();<br>

> -  }<br>

> +  parallel_for_each(Symtab.<wbr>Sections.begin(), Symtab.Sections.end(),<br>

> +                    [](InputSectionBase<ELFT> *S) {<br>

> +                      if (!S->Live)<br>

> +                        return;<br>

> +                      if (S->Compressed)<br>

> +                        S->uncompress();<br>

> +                      if (auto *MS = dyn_cast<MergeInputSection<<wbr>ELFT>>(S))<br>

> +                        MS->splitIntoPieces();<br>

> +                    });<br>

><br>

>    // Write the result to the file.<br>

>    writeResult<ELFT>();<br>

><br>

> Modified: lld/trunk/ELF/InputSection.cpp<br>

> URL: <a href="http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/InputSection.cpp?rev=287946&r1=287945&r2=287946&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/lld/trunk/ELF/<wbr>InputSection.cpp?rev=287946&<wbr>r1=287945&r2=287946&view=diff</a><br>

> ==============================<wbr>==============================<wbr>==================<br>

> --- lld/trunk/ELF/InputSection.cpp (original)<br>

> +++ lld/trunk/ELF/InputSection.cpp Fri Nov 25 14:05:08 2016<br>

> @@ -22,6 +22,7 @@<br>

><br>

>  #include "llvm/Support/Compression.h"<br>

>  #include "llvm/Support/Endian.h"<br>

> +#include <mutex><br>

><br>

>  using namespace llvm;<br>

>  using namespace llvm::ELF;<br>

> @@ -160,6 +161,8 @@ InputSectionBase<ELFT>::<wbr>getRawCompressed<br>

>    return {Data.slice(sizeof(*Hdr)), read64be(Hdr->Size)};<br>

>  }<br>

><br>

> +// Uncompress section contents. Note that this function is called<br>

> +// from parallel_for_each, so it must be thread-safe.<br>

>  template <class ELFT> void InputSectionBase<ELFT>::<wbr>uncompress() {<br>

>    if (!zlib::isAvailable())<br>

>      fatal(toString(this) +<br>

> @@ -179,7 +182,12 @@ template <class ELFT> void InputSectionB<br>

>      std::tie(Buf, Size) = getRawCompressedData(Data);<br>

><br>

>    // Uncompress Buf.<br>

> -  char *OutputBuf = BAlloc.Allocate<char>(Size);<br>

> +  char *OutputBuf;<br>

> +  {<br>

> +    static std::mutex Mu;<br>

> +    std::lock_guard<std::mutex> Lock(Mu);<br>

> +    OutputBuf = BAlloc.Allocate<char>(Size);<br>

> +  }<br>

>    if (zlib::uncompress(toStringRef(<wbr>Buf), OutputBuf, Size) != zlib::StatusOK)<br>

>      fatal(toString(this) + ": error while uncompressing section");<br>

>    Data = ArrayRef<uint8_t>((uint8_t *)OutputBuf, Size);<br>

> @@ -746,6 +754,12 @@ MergeInputSection<ELFT>::<wbr>MergeInputSecti<br>

>                                             StringRef Name)<br>

>      : InputSectionBase<ELFT>(F, Header, Name, InputSectionBase<ELFT>::Merge) {}<br>

><br>

> +// This function is called after we obtain a complete list of input sections<br>

> +// that need to be linked. This is responsible to split section contents<br>

> +// into small chunks for further processing.<br>

> +//<br>

> +// Note that this function is called from parallel_for_each. This must be<br>

> +// thread-safe (i.e. no memory allocation from the pools).<br>

>  template <class ELFT> void MergeInputSection<ELFT>::<wbr>splitIntoPieces() {<br>

>    ArrayRef<uint8_t> Data = this->Data;<br>

>    uintX_t EntSize = this->Entsize;<br>

><br>

><br>

> ______________________________<wbr>_________________<br>

> llvm-commits mailing list<br>

> <a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a><br>

> <a href="http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits" rel="noreferrer" target="_blank">http://lists.llvm.org/cgi-bin/<wbr>mailman/listinfo/llvm-commits</a><br>

</div></div></blockquote></div><br></div>