<div dir="ltr">What is your machine spec by the way?</div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Dec 5, 2016 at 9:22 AM, Rafael Avila de Espindola <span dir="ltr"><<a href="mailto:rafael.espindola@gmail.com" target="_blank">rafael.espindola@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><br>
Thanks!<br>
<br>
I didn't have access to my workstation last week. Now that I do, I<br>
measure 1.15226905502x faster for firefox and 1.27814295845x faster for<br>
scylla, the two programs with debug info in the tests I normally run.<br>
<br>
Cheers,<br>
Rafael<br>
<div class="HOEnZb"><div class="h5"><br>
Rui Ueyama via llvm-commits <<a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a>> writes:<br>
<br>
> Author: ruiu<br>
> Date: Fri Nov 25 14:05:08 2016<br>
> New Revision: 287946<br>
><br>
> URL: <a href="http://llvm.org/viewvc/llvm-project?rev=287946&view=rev" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project?rev=287946&view=rev</a><br>
> Log:<br>
> Parallelize uncompress() and splitIntoPieces().<br>
><br>
> Uncompressing section contents and splitting mergeable section contents<br>
> into smaller chunks are heavy tasks. They scan entire section contents<br>
> and do CPU-intensive tasks such as uncompressing zlib-compressed data<br>
> or computing a hash value for each section piece.<br>
><br>
> Luckily, these tasks are independent of each other, so we can do that<br>
> in parallel_for_each. The number of input sections is large (as opposed<br>
> to the number of output sections), so there's a large parallelism here.<br>
><br>
> Actually, the current design of calling uncompress() and splitIntoPieces()<br>
> in a batch was chosen with this in mind. Basically, what we need to<br>
> do here is to replace `for` with `parallel_for_each`.<br>
><br>
> It seems this patch improves latency significantly if linked programs<br>
> contain debug info (which in turn contains lots of mergeable strings).<br>
> For example, the latency to link Clang (debug build) improved by 20% on<br>
> my machine as shown below. Note that ld.gold took 19.2 seconds to do<br>
> the same thing.<br>
><br>
> Before:<br>
>     30801.782712 task-clock (msec)         #    3.652 CPUs utilized            ( +-  2.59% )<br>
>          104,084 context-switches          #    0.003 M/sec                    ( +-  1.02% )<br>
>            5,063 cpu-migrations            #    0.164 K/sec                    ( +- 13.66% )<br>
>        2,528,130 page-faults               #    0.082 M/sec                    ( +-  0.47% )<br>
>   85,317,809,130 cycles                    #    2.770 GHz                      ( +-  2.62% )<br>
>   67,352,463,373 stalled-cycles-frontend   #   78.94% frontend cycles idle     ( +-  3.06% )<br>
>  <not supported> stalled-cycles-backend<br>
>   44,295,945,493 instructions              #    0.52  insns per cycle<br>
>                                            #    1.52  stalled cycles per insn  ( +-  0.44% )<br>
>    8,572,384,877 branches                  #  278.308 M/sec                    ( +-  0.66% )<br>
>      141,806,726 branch-misses             #    1.65% of all branches          ( +-  0.13% )<br>
><br>
>      8.433424003 seconds time elapsed                                          ( +-  1.20% )<br>
><br>
> After:<br>
>     35523.764575 task-clock (msec)         #    5.265 CPUs utilized            ( +-  2.67% )<br>
>          159,107 context-switches          #    0.004 M/sec                    ( +-  0.48% )<br>
>            8,123 cpu-migrations            #    0.229 K/sec                    ( +- 23.34% )<br>
>        2,372,483 page-faults               #    0.067 M/sec                    ( +-  0.36% )<br>
>   98,395,342,152 cycles                    #    2.770 GHz                      ( +-  2.62% )<br>
>   79,294,670,125 stalled-cycles-frontend   #   80.59% frontend cycles idle     ( +-  3.03% )<br>
>  <not supported> stalled-cycles-backend<br>
>   46,274,151,813 instructions              #    0.47  insns per cycle<br>
>                                            #    1.71  stalled cycles per insn  ( +-  0.47% )<br>
>    8,987,621,670 branches                  #  253.003 M/sec                    ( +-  0.60% )<br>
>      148,900,624 branch-misses             #    1.66% of all branches          ( +-  0.27% )<br>
><br>
>      6.747548004 seconds time elapsed                                          ( +-  0.40% )<br>
><br>
> Modified:<br>
>     lld/trunk/ELF/Driver.cpp<br>
>     lld/trunk/ELF/InputSection.cpp<br>
><br>
> Modified: lld/trunk/ELF/Driver.cpp<br>
> URL: <a href="http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/Driver.cpp?rev=287946&r1=287945&r2=287946&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/lld/trunk/ELF/Driver.<wbr>cpp?rev=287946&r1=287945&r2=<wbr>287946&view=diff</a><br>
> ==============================<wbr>==============================<wbr>==================<br>
> --- lld/trunk/ELF/Driver.cpp (original)<br>
> +++ lld/trunk/ELF/Driver.cpp Fri Nov 25 14:05:08 2016<br>
> @@ -20,6 +20,7 @@<br>
>  #include "Target.h"<br>
>  #include "Writer.h"<br>
>  #include "lld/Config/Version.h"<br>
> +#include "lld/Core/Parallel.h"<br>
>  #include "lld/Driver/Driver.h"<br>
>  #include "llvm/ADT/StringExtras.h"<br>
>  #include "llvm/ADT/StringSwitch.h"<br>
> @@ -800,14 +801,15 @@ template <class ELFT> void LinkerDriver:<br>
><br>
>    // MergeInputSection::<wbr>splitIntoPieces needs to be called before<br>
>    // any call of MergeInputSection::getOffset. Do that.<br>
> -  for (InputSectionBase<ELFT> *S : Symtab.Sections) {<br>
> -    if (!S->Live)<br>
> -      continue;<br>
> -    if (S->Compressed)<br>
> -      S->uncompress();<br>
> -    if (auto *MS = dyn_cast<MergeInputSection<<wbr>ELFT>>(S))<br>
> -      MS->splitIntoPieces();<br>
> -  }<br>
> +  parallel_for_each(Symtab.<wbr>Sections.begin(), Symtab.Sections.end(),<br>
> +                    [](InputSectionBase<ELFT> *S) {<br>
> +                      if (!S->Live)<br>
> +                        return;<br>
> +                      if (S->Compressed)<br>
> +                        S->uncompress();<br>
> +                      if (auto *MS = dyn_cast<MergeInputSection<<wbr>ELFT>>(S))<br>
> +                        MS->splitIntoPieces();<br>
> +                    });<br>
><br>
>    // Write the result to the file.<br>
>    writeResult<ELFT>();<br>
><br>
> Modified: lld/trunk/ELF/InputSection.cpp<br>
> URL: <a href="http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/InputSection.cpp?rev=287946&r1=287945&r2=287946&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/lld/trunk/ELF/<wbr>InputSection.cpp?rev=287946&<wbr>r1=287945&r2=287946&view=diff</a><br>
> ==============================<wbr>==============================<wbr>==================<br>
> --- lld/trunk/ELF/InputSection.cpp (original)<br>
> +++ lld/trunk/ELF/InputSection.cpp Fri Nov 25 14:05:08 2016<br>
> @@ -22,6 +22,7 @@<br>
><br>
>  #include "llvm/Support/Compression.h"<br>
>  #include "llvm/Support/Endian.h"<br>
> +#include <mutex><br>
><br>
>  using namespace llvm;<br>
>  using namespace llvm::ELF;<br>
> @@ -160,6 +161,8 @@ InputSectionBase<ELFT>::<wbr>getRawCompressed<br>
>    return {Data.slice(sizeof(*Hdr)), read64be(Hdr->Size)};<br>
>  }<br>
><br>
> +// Uncompress section contents. Note that this function is called<br>
> +// from parallel_for_each, so it must be thread-safe.<br>
>  template <class ELFT> void InputSectionBase<ELFT>::<wbr>uncompress() {<br>
>    if (!zlib::isAvailable())<br>
>      fatal(toString(this) +<br>
> @@ -179,7 +182,12 @@ template <class ELFT> void InputSectionB<br>
>      std::tie(Buf, Size) = getRawCompressedData(Data);<br>
><br>
>    // Uncompress Buf.<br>
> -  char *OutputBuf = BAlloc.Allocate<char>(Size);<br>
> +  char *OutputBuf;<br>
> +  {<br>
> +    static std::mutex Mu;<br>
> +    std::lock_guard<std::mutex> Lock(Mu);<br>
> +    OutputBuf = BAlloc.Allocate<char>(Size);<br>
> +  }<br>
>    if (zlib::uncompress(toStringRef(<wbr>Buf), OutputBuf, Size) != zlib::StatusOK)<br>
>      fatal(toString(this) + ": error while uncompressing section");<br>
>    Data = ArrayRef<uint8_t>((uint8_t *)OutputBuf, Size);<br>
> @@ -746,6 +754,12 @@ MergeInputSection<ELFT>::<wbr>MergeInputSecti<br>
>                                             StringRef Name)<br>
>      : InputSectionBase<ELFT>(F, Header, Name, InputSectionBase<ELFT>::Merge) {}<br>
><br>
> +// This function is called after we obtain a complete list of input sections<br>
> +// that need to be linked. This is responsible to split section contents<br>
> +// into small chunks for further processing.<br>
> +//<br>
> +// Note that this function is called from parallel_for_each. This must be<br>
> +// thread-safe (i.e. no memory allocation from the pools).<br>
>  template <class ELFT> void MergeInputSection<ELFT>::<wbr>splitIntoPieces() {<br>
>    ArrayRef<uint8_t> Data = this->Data;<br>
>    uintX_t EntSize = this->Entsize;<br>
><br>
><br>
> ______________________________<wbr>_________________<br>
> llvm-commits mailing list<br>
> <a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a><br>
> <a href="http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits" rel="noreferrer" target="_blank">http://lists.llvm.org/cgi-bin/<wbr>mailman/listinfo/llvm-commits</a><br>
</div></div></blockquote></div><br></div>