[lld] r287140 - Reduce number of tasks in parallel_for_each.

Wed Nov 16 11:27:34 PST 2016

Author: ruiu
Date: Wed Nov 16 13:27:33 2016
New Revision: 287140

URL: http://llvm.org/viewvc/llvm-project?rev=287140&view=rev
Log:
Reduce number of tasks in parallel_for_each.

TaskGroup has a fairly high overhead, so we don't want to partition
tasks into too small tasks. This patch partition tasks into up to
1024 tasks.

I compared this patch with the original LLD's parallel_for_each.
I reverted r287042 locally for comparison.

With this patch, time to self-link lld with debug info changed from
6.23 seconds to 4.62 seconds (-25.8%), with -threads and without -build-id.
With both -threads and -build-id, it improved from 11.71 seconds
to 4.94 seconds (-57.8%). Full results are below.

BTW, GNU gold takes 11.65 seconds to link the same binary.

NOW

--no-threads --build-id=none
       6789.847776 task-clock (msec)         #    1.000 CPUs utilized            ( +-  1.86% )
               685 context-switches          #    0.101 K/sec                    ( +-  2.82% )
                 4 cpu-migrations            #    0.001 K/sec                    ( +- 31.18% )
         1,424,690 page-faults               #    0.210 M/sec                    ( +-  1.07% )
    21,339,542,522 cycles                    #    3.143 GHz                      ( +-  1.49% )
    13,092,260,230 stalled-cycles-frontend   #   61.35% frontend cycles idle     ( +-  2.23% )
   <not supported> stalled-cycles-backend
    21,462,051,828 instructions              #    1.01  insns per cycle
                                             #    0.61  stalled cycles per insn  ( +-  0.41% )
     3,955,296,378 branches                  #  582.531 M/sec                    ( +-  0.39% )
        75,699,909 branch-misses             #    1.91% of all branches          ( +-  0.08% )

       6.787630744 seconds time elapsed                                          ( +-  1.86% )

--threads --build-id=none
      14767.148697 task-clock (msec)         #    3.196 CPUs utilized            ( +-  2.56% )
            28,891 context-switches          #    0.002 M/sec                    ( +-  1.99% )
               905 cpu-migrations            #    0.061 K/sec                    ( +-  5.49% )
         1,262,122 page-faults               #    0.085 M/sec                    ( +-  1.68% )
    43,116,163,217 cycles                    #    2.920 GHz                      ( +-  3.07% )
    33,690,171,242 stalled-cycles-frontend   #   78.14% frontend cycles idle     ( +-  3.67% )
   <not supported> stalled-cycles-backend
    22,836,731,536 instructions              #    0.53  insns per cycle
                                             #    1.48  stalled cycles per insn  ( +-  1.13% )
     4,382,712,998 branches                  #  296.788 M/sec                    ( +-  1.33% )
        78,622,295 branch-misses             #    1.79% of all branches          ( +-  0.54% )

       4.621228056 seconds time elapsed                                          ( +-  1.90% )

--threads --build-id=sha1
      24594.457135 task-clock (msec)         #    4.974 CPUs utilized            ( +-  1.78% )
            29,902 context-switches          #    0.001 M/sec                    ( +-  2.62% )
             1,097 cpu-migrations            #    0.045 K/sec                    ( +-  6.29% )
         1,313,947 page-faults               #    0.053 M/sec                    ( +-  2.36% )
    70,516,415,741 cycles                    #    2.867 GHz                      ( +-  0.78% )
    47,570,262,296 stalled-cycles-frontend   #   67.46% frontend cycles idle     ( +-  0.86% )
   <not supported> stalled-cycles-backend
    73,124,599,029 instructions              #    1.04  insns per cycle
                                             #    0.65  stalled cycles per insn  ( +-  0.33% )
    10,495,266,104 branches                  #  426.733 M/sec                    ( +-  0.41% )
        91,444,149 branch-misses             #    0.87% of all branches          ( +-  0.83% )

       4.944291711 seconds time elapsed                                          ( +-  1.72% )

PREVIOUS

--threads --build-id=none
       7307.437544 task-clock (msec)         #    1.160 CPUs utilized            ( +-  2.34% )
             3,128 context-switches          #    0.428 K/sec                    ( +-  4.37% )
               352 cpu-migrations            #    0.048 K/sec                    ( +-  5.98% )
         1,354,450 page-faults               #    0.185 M/sec                    ( +-  2.20% )
    22,081,733,098 cycles                    #    3.022 GHz                      ( +-  1.46% )
    13,709,991,267 stalled-cycles-frontend   #   62.09% frontend cycles idle     ( +-  1.77% )
   <not supported> stalled-cycles-backend
    21,634,468,895 instructions              #    0.98  insns per cycle
                                             #    0.63  stalled cycles per insn  ( +-  0.86% )
     3,993,062,361 branches                  #  546.438 M/sec                    ( +-  0.83% )
        76,188,819 branch-misses             #    1.91% of all branches          ( +-  0.19% )

       6.298101157 seconds time elapsed                                          ( +-  2.03% )

--threads --build-id=sha1
      12845.420265 task-clock (msec)         #    1.097 CPUs utilized            ( +-  1.95% )
             4,020 context-switches          #    0.313 K/sec                    ( +-  2.89% )
               369 cpu-migrations            #    0.029 K/sec                    ( +-  6.26% )
         1,464,822 page-faults               #    0.114 M/sec                    ( +-  1.37% )
    40,668,449,813 cycles                    #    3.166 GHz                      ( +-  0.96% )
    18,863,982,388 stalled-cycles-frontend   #   46.38% frontend cycles idle     ( +-  1.82% )
   <not supported> stalled-cycles-backend
    71,560,499,058 instructions              #    1.76  insns per cycle
                                             #    0.26  stalled cycles per insn  ( +-  0.14% )
    10,044,152,441 branches                  #  781.925 M/sec                    ( +-  0.19% )
        87,835,773 branch-misses             #    0.87% of all branches          ( +-  0.09% )

      11.711773314 seconds time elapsed                                          ( +-  1.51% )

Modified:
    lld/trunk/include/lld/Core/Parallel.h

Modified: lld/trunk/include/lld/Core/Parallel.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/include/lld/Core/Parallel.h?rev=287140&r1=287139&r2=287140&view=diff
==============================================================================

--- lld/trunk/include/lld/Core/Parallel.h (original)
+++ lld/trunk/include/lld/Core/Parallel.h Wed Nov 16 13:27:33 2016
@@ -283,9 +283,20 @@ void parallel_for_each(Iterator begin, I
 #else
 template <class Iterator, class Func>
 void parallel_for_each(Iterator begin, Iterator end, Func func) {
+  // TaskGroup has a relatively high overhead, so we want to reduce
+  // the number of spawn() calls. We'll create up to 1024 tasks here.
+  // (Note that 1024 is an arbitrary number. This code probably needs
+  // improving to take the number of available cores into account.)
+  ptrdiff_t taskSize = std::distance(begin, end) / 1024;
+  if (taskSize == 0)
+    taskSize = 1;
+
   TaskGroup tg;
-  for (; begin != end; ++begin)
-    tg.spawn([=, &func] { func(*begin); });
+  while (taskSize <= std::distance(begin, end)) {
+    tg.spawn([=, &func] { std::for_each(begin, begin + taskSize, func); });
+    begin += taskSize;
+  }
+  std::for_each(begin, end, func);
 }
 #endif
 } // end namespace lld