[clang] 0915825 - [ThinLTO] Allow usage of all hardware threads in the system

Fri Mar 27 07:21:11 PDT 2020

Author: Alexandre Ganea
Date: 2020-03-27T10:20:58-04:00
New Revision: 09158252f777c2e2f06a86b154c44abcbcf9bb74

URL: https://github.com/llvm/llvm-project/commit/09158252f777c2e2f06a86b154c44abcbcf9bb74
DIFF: https://github.com/llvm/llvm-project/commit/09158252f777c2e2f06a86b154c44abcbcf9bb74.diff

LOG: [ThinLTO] Allow usage of all hardware threads in the system

Before this patch, it wasn't possible to extend the ThinLTO threads to all SMT/CMT threads in the system. Only one thread per core was allowed, instructed by usage of llvm::heavyweight_hardware_concurrency() in the ThinLTO code. Any number passed to the LLD flag /opt:lldltojobs=..., or any other ThinLTO-specific flag, was previously interpreted in the context of llvm::heavyweight_hardware_concurrency(), which means SMT disabled.

One can now say in LLD:
/opt:lldltojobs=0 -- Use one std::thread / hardware core in the system (no SMT). Default value if flag not specified.
/opt:lldltojobs=N -- Limit usage to N threads, regardless of usage of heavyweight_hardware_concurrency().
/opt:lldltojobs=all -- Use all hardware threads in the system. Equivalent to /opt:lldltojobs=$(nproc) on Linux and /opt:lldltojobs=%NUMBER_OF_PROCESSORS% on Windows. When an affinity mask is set for the process, threads will be created only for the cores selected by the mask.

When N > number-of-hardware-threads-in-the-system, the threads in the thread pool will be dispatched equally on all CPU sockets (tested only on Windows).
When N <= number-of-hardware-threads-on-a-CPU-socket, the threads will remain on the CPU socket where the process started (only on Windows).

Differential Revision: https://reviews.llvm.org/D75153

Added: 
    

Modified: 
    clang/lib/Driver/ToolChains/CommonArgs.cpp
    clang/lib/Driver/ToolChains/CommonArgs.h
    clang/lib/Driver/ToolChains/Darwin.cpp
    lld/COFF/Config.h
    lld/COFF/Driver.cpp
    lld/COFF/LTO.cpp
    lld/ELF/Config.h
    lld/ELF/Driver.cpp
    lld/ELF/LTO.cpp
    lld/test/COFF/thinlto.ll
    lld/test/ELF/basic.s
    lld/test/ELF/lto/thinlto.ll
    lld/test/wasm/lto/thinlto.ll
    lld/wasm/Config.h
    lld/wasm/Driver.cpp
    lld/wasm/LTO.cpp
    llvm/include/llvm/LTO/LTO.h
    llvm/include/llvm/Support/Threading.h
    llvm/lib/LTO/LTO.cpp
    llvm/lib/Support/Threading.cpp
    llvm/lib/Support/Unix/Threading.inc
    llvm/lib/Support/Windows/Threading.inc
    llvm/tools/gold/gold-plugin.cpp
    llvm/tools/llvm-lto2/llvm-lto2.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 51b2e5f3c34d..303ced52454c 100644

--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -50,6 +50,7 @@
 #include "llvm/Support/Program.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/TargetParser.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/YAMLParser.h"
 
@@ -338,14 +339,14 @@ std::string tools::getCPUName(const ArgList &Args, const llvm::Triple &T,
   }
 }
 
-unsigned tools::getLTOParallelism(const ArgList &Args, const Driver &D) {
-  unsigned Parallelism = 0;
+llvm::StringRef tools::getLTOParallelism(const ArgList &Args, const Driver &D) {
   Arg *LtoJobsArg = Args.getLastArg(options::OPT_flto_jobs_EQ);
-  if (LtoJobsArg &&
-      StringRef(LtoJobsArg->getValue()).getAsInteger(10, Parallelism))
-    D.Diag(diag::err_drv_invalid_int_value) << LtoJobsArg->getAsString(Args)
-                                            << LtoJobsArg->getValue();
-  return Parallelism;
+  if (!LtoJobsArg)
+    return {};
+  if (!llvm::get_threadpool_strategy(LtoJobsArg->getValue()))
+    D.Diag(diag::err_drv_invalid_int_value)
+        << LtoJobsArg->getAsString(Args) << LtoJobsArg->getValue();
+  return LtoJobsArg->getValue();
 }
 
 // CloudABI uses -ffunction-sections and -fdata-sections by default.
@@ -410,7 +411,8 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   if (IsThinLTO)
     CmdArgs.push_back("-plugin-opt=thinlto");
 
-  if (unsigned Parallelism = getLTOParallelism(Args, ToolChain.getDriver()))
+  StringRef Parallelism = getLTOParallelism(Args, ToolChain.getDriver());
+  if (!Parallelism.empty())
     CmdArgs.push_back(
         Args.MakeArgString("-plugin-opt=jobs=" + Twine(Parallelism)));
 

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index 984f3ee98af1..c94b2b828c9b 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -88,7 +88,8 @@ llvm::opt::Arg *getLastProfileSampleUseArg(const llvm::opt::ArgList &Args);
 
 bool isObjCAutoRefCount(const llvm::opt::ArgList &Args);
 
-unsigned getLTOParallelism(const llvm::opt::ArgList &Args, const Driver &D);
+llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args,
+                                  const Driver &D);
 
 bool areOptimizationsEnabled(const llvm::opt::ArgList &Args);
 

diff  --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index d1aec649cdc7..451d0d206d07 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/TargetParser.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include <cstdlib> // ::getenv
 
@@ -605,10 +606,12 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   getMachOToolChain().addProfileRTLibs(Args, CmdArgs);
 
-  if (unsigned Parallelism =
-          getLTOParallelism(Args, getToolChain().getDriver())) {
+  StringRef Parallelism = getLTOParallelism(Args, getToolChain().getDriver());
+  if (!Parallelism.empty()) {
     CmdArgs.push_back("-mllvm");
-    CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(Parallelism)));
+    unsigned NumThreads =
+        llvm::get_threadpool_strategy(Parallelism)->compute_thread_count();
+    CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(NumThreads)));
   }
 
   if (getToolChain().ShouldLinkCXXStdlib(Args))

diff  --git a/lld/COFF/Config.h b/lld/COFF/Config.h
index e376ea51f133..93b732ea6821 100644
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -144,7 +144,7 @@ struct Configuration {
   unsigned ltoo = 2;
 
   // Used for /opt:lldltojobs=N
-  unsigned thinLTOJobs = 0;
+  std::string thinLTOJobs;
   // Used for /opt:lldltopartitions=N
   unsigned ltoPartitions = 1;
 

diff  --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 8b3ba1cdf24b..7d99f798a322 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -1417,9 +1417,9 @@ void LinkerDriver::link(ArrayRef<const char *> argsArr) {
           error("/opt:lldlto: invalid optimization level: " + optLevel);
       } else if (s.startswith("lldltojobs=")) {
         StringRef jobs = s.substr(11);
-        if (jobs.getAsInteger(10, config->thinLTOJobs) ||
-            config->thinLTOJobs == 0)
+        if (!get_threadpool_strategy(jobs))
           error("/opt:lldltojobs: invalid job count: " + jobs);
+        config->thinLTOJobs = jobs.str();
       } else if (s.startswith("lldltopartitions=")) {
         StringRef n = s.substr(17);
         if (n.getAsInteger(10, config->ltoPartitions) ||

diff  --git a/lld/COFF/LTO.cpp b/lld/COFF/LTO.cpp
index 247331b16928..94906e500593 100644
--- a/lld/COFF/LTO.cpp
+++ b/lld/COFF/LTO.cpp
@@ -101,8 +101,9 @@ BitcodeCompiler::BitcodeCompiler() {
         std::string(config->thinLTOPrefixReplace.first),
         std::string(config->thinLTOPrefixReplace.second),
         config->thinLTOEmitImportsFiles, indexFile.get(), OnIndexWrite);
-  } else if (config->thinLTOJobs != 0) {
-    backend = lto::createInProcessThinBackend(config->thinLTOJobs);
+  } else {
+    backend = lto::createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(config->thinLTOJobs));
   }
 
   ltoObj = std::make_unique<lto::LTO>(createConfig(), backend,

diff  --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 9f9f0d696f63..0263708634cb 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -246,7 +246,7 @@ struct Configuration {
   unsigned ltoPartitions;
   unsigned ltoo;
   unsigned optimize;
-  unsigned thinLTOJobs;
+  StringRef thinLTOJobs;
   unsigned timeTraceGranularity;
   int32_t splitStackAdjustSize;
 

diff  --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index f9a1340e25d3..95276cd33420 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -974,7 +974,7 @@ static void readConfigs(opt::InputArgList &args) {
   config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) ||
                              args.hasArg(OPT_thinlto_index_only_eq);
   config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq);
-  config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u);
+  config->thinLTOJobs = args.getLastArgValue(OPT_thinlto_jobs);
   config->thinLTOObjectSuffixReplace =
       getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq);
   config->thinLTOPrefixReplace =
@@ -1040,8 +1040,8 @@ static void readConfigs(opt::InputArgList &args) {
     error("invalid optimization level for LTO: " + Twine(config->ltoo));
   if (config->ltoPartitions == 0)
     error("--lto-partitions: number of threads must be > 0");
-  if (config->thinLTOJobs == 0)
-    error("--thinlto-jobs: number of threads must be > 0");
+  if (!get_threadpool_strategy(config->thinLTOJobs))
+    error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs);
 
   if (config->splitStackAdjustSize < 0)
     error("--split-stack-adjust-size: size must be >= 0");

diff  --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index ba943283706a..f4e04a82f0c2 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -146,8 +146,9 @@ BitcodeCompiler::BitcodeCompiler() {
         std::string(config->thinLTOPrefixReplace.first),
         std::string(config->thinLTOPrefixReplace.second),
         config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite);
-  } else if (config->thinLTOJobs != -1U) {
-    backend = lto::createInProcessThinBackend(config->thinLTOJobs);
+  } else {
+    backend = lto::createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(config->thinLTOJobs));
   }
 
   ltoObj = std::make_unique<lto::LTO>(createConfig(), backend,

diff  --git a/lld/test/COFF/thinlto.ll b/lld/test/COFF/thinlto.ll
index 517be330c710..a8390721cc42 100644
--- a/lld/test/COFF/thinlto.ll
+++ b/lld/test/COFF/thinlto.ll
@@ -6,6 +6,16 @@
 ; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj
 ; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s
 
+; Test various possible options for /opt:lldltojobs
+; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1
+; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s
+; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=all
+; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s
+; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1000
+; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s
+; RUN: not lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=foo 2>&1 | FileCheck %s --check-prefix=BAD-JOBS
+; BAD-JOBS: error: /opt:lldltojobs: invalid job count: foo
+
 ; This command will store full path to foo.obj in the archive %t.lib
 ; Check that /lldsavetemps is still usable in such case.
 ; RUN: lld-link /lib %T/thinlto/foo.obj /out:%t.lib

diff  --git a/lld/test/ELF/basic.s b/lld/test/ELF/basic.s
index b0509a2dc123..5300c6efa2ef 100644
--- a/lld/test/ELF/basic.s
+++ b/lld/test/ELF/basic.s
@@ -249,9 +249,21 @@ _start:
 # RUN: not ld.lld %t --plugin-opt=lto-partitions=0 2>&1 | FileCheck --check-prefix=NOTHREADS %s
 # NOTHREADS: --lto-partitions: number of threads must be > 0
 
-# RUN: not ld.lld %t --thinlto-jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s
-# RUN: not ld.lld %t --plugin-opt=jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s
-# NOTHREADSTHIN: --thinlto-jobs: number of threads must be > 0
+# RUN: ld.lld %t --thinlto-jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --thinlto-jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --thinlto-jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --thinlto-jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --thinlto-jobs=1000 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# THREADSTHIN: basic.s.tmp
+# RUN: not ld.lld %t --thinlto-jobs=foo -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s
+# BADTHREADSTHIN: error: --thinlto-jobs: invalid job count: foo
+
+# RUN: ld.lld %t --plugin-opt=jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --plugin-opt=jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --plugin-opt=jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --plugin-opt=jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: ld.lld %t --plugin-opt=jobs=1000 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s
+# RUN: not ld.lld %t --plugin-opt=jobs=foo -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s
 
 # RUN: not ld.lld %t -z ifunc-noplt -z text 2>&1 | FileCheck --check-prefix=NOIFUNCPLTNOTEXTREL %s
 # NOIFUNCPLTNOTEXTREL: -z text and -z ifunc-noplt may not be used together

diff  --git a/lld/test/ELF/lto/thinlto.ll b/lld/test/ELF/lto/thinlto.ll
index 2a7ffb90f6ff..ce37433f0bf4 100644
--- a/lld/test/ELF/lto/thinlto.ll
+++ b/lld/test/ELF/lto/thinlto.ll
@@ -16,8 +16,25 @@
 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
 
-; Then check without --thinlto-jobs (which currently default to hardware_concurrency)
-; RUN: ld.lld -shared %t1.o %t2.o -o %t3
+; Test with all threads, on all cores, on all CPU sockets
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: ld.lld -save-temps --thinlto-jobs=all -shared %t1.o %t2.o -o %t3
+; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
+; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
+
+; Test with many more threads than the system has
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: ld.lld -save-temps --thinlto-jobs=1000 -shared %t1.o %t2.o -o %t3
+; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
+; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
+
+; Test with a bad value
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS
+; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo
+
+; Then check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meanning one thread per hardware core -- not SMT)
+; RUN: ld.lld -shared -save-temps %t1.o %t2.o -o %t3
 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
 

diff  --git a/lld/test/wasm/lto/thinlto.ll b/lld/test/wasm/lto/thinlto.ll
index 062da1a33b89..460452905597 100644
--- a/lld/test/wasm/lto/thinlto.ll
+++ b/lld/test/wasm/lto/thinlto.ll
@@ -14,8 +14,26 @@
 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
 
-; Check without --thinlto-jobs (which currently default to hardware_concurrency)
-; RUN: wasm-ld -r %t1.o %t2.o -o %t3
+; Test with all threads, on all cores, on all CPU sockets
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: wasm-ld -r -save-temps --thinlto-jobs=all %t1.o %t2.o -o %t3
+; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
+; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
+
+; Test with many more threads than the system has
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: wasm-ld -r -save-temps --thinlto-jobs=1000 %t1.o %t2.o -o %t3
+; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
+; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
+
+; Test with a bad value
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: not wasm-ld -r -save-temps --thinlto-jobs=foo %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS
+; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo
+
+; Check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meanning one thread per hardware core -- not SMT)
+; RUN: rm -f %t31.lto.o %t32.lto.o
+; RUN: wasm-ld -r -save-temps %t1.o %t2.o -o %t3
 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1
 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2
 

diff  --git a/lld/wasm/Config.h b/lld/wasm/Config.h
index 3b3f2daf576c..b396bda93d58 100644
--- a/lld/wasm/Config.h
+++ b/lld/wasm/Config.h
@@ -53,7 +53,7 @@ struct Configuration {
   unsigned ltoPartitions;
   unsigned ltoo;
   unsigned optimize;
-  unsigned thinLTOJobs;
+  llvm::StringRef thinLTOJobs;
 
   llvm::StringRef entry;
   llvm::StringRef outputFile;

diff  --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index 73b330ae1470..b38e980004ad 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -362,7 +362,7 @@ static void readConfigs(opt::InputArgList &args) {
   config->thinLTOCachePolicy = CHECK(
       parseCachePruningPolicy(args.getLastArgValue(OPT_thinlto_cache_policy)),
       "--thinlto-cache-policy: invalid cache policy");
-  config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u);
+  config->thinLTOJobs = args.getLastArgValue(OPT_thinlto_jobs);
   errorHandler().verbose = args.hasArg(OPT_verbose);
   LLVM_DEBUG(errorHandler().verbose = true);
   threadsEnabled = args.hasFlag(OPT_threads, OPT_no_threads, true);
@@ -415,8 +415,8 @@ static void checkOptions(opt::InputArgList &args) {
     error("invalid optimization level for LTO: " + Twine(config->ltoo));
   if (config->ltoPartitions == 0)
     error("--lto-partitions: number of threads must be > 0");
-  if (config->thinLTOJobs == 0)
-    error("--thinlto-jobs: number of threads must be > 0");
+  if (!get_threadpool_strategy(config->thinLTOJobs))
+    error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs);
 
   if (config->pie && config->shared)
     error("-shared and -pie may not be used together");

diff  --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp
index 0b7d8cb12d0e..28b62022f29f 100644
--- a/lld/wasm/LTO.cpp
+++ b/lld/wasm/LTO.cpp
@@ -63,10 +63,8 @@ static std::unique_ptr<lto::LTO> createLTO() {
   if (config->saveTemps)
     checkError(c.addSaveTemps(config->outputFile.str() + ".",
                               /*UseInputModulePath*/ true));
-
-  lto::ThinBackend backend;
-  if (config->thinLTOJobs != -1U)
-    backend = lto::createInProcessThinBackend(config->thinLTOJobs);
+  lto::ThinBackend backend = lto::createInProcessThinBackend(
+      llvm::heavyweight_hardware_concurrency(config->thinLTOJobs));
   return std::make_unique<lto::LTO>(std::move(c), backend,
                                      config->ltoPartitions);
 }

diff  --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index df2853345682..e93439819415 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -228,7 +228,7 @@ using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>(
 
 /// This ThinBackend runs the individual backend jobs in-process.
 /// The default value means to use one job per hardware core (not hyper-thread).
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);
+ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism);
 
 /// This ThinBackend writes individual module indexes to files, instead of
 /// running the individual backend jobs. This backend is for distributed builds

diff  --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h
index 24a0789e0964..f133c790ad96 100644
--- a/llvm/include/llvm/Support/Threading.h
+++ b/llvm/include/llvm/Support/Threading.h
@@ -166,8 +166,20 @@ void llvm_execute_on_thread_async(
     /// sockets. \p ThreadPoolNum represents a number bounded by [0,
     /// compute_thread_count()).
     void apply_thread_strategy(unsigned ThreadPoolNum) const;
+
+    /// Finds the CPU socket where a thread should go. Returns 'None' if the
+    /// thread shall remain on the actual CPU socket.
+    Optional<unsigned> compute_cpu_socket(unsigned ThreadPoolNum) const;
   };
 
+  /// Build a strategy from a number of threads as a string provided in \p Num.
+  /// When Num is above the max number of threads specified by the \p Default
+  /// strategy, we attempt to equally allocate the threads on all CPU sockets.
+  /// "0" or an empty string will return the \p Default strategy.
+  /// "all" for using all hardware threads.
+  Optional<ThreadPoolStrategy>
+  get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default = {});
+
   /// Returns a thread strategy for tasks requiring significant memory or other
   /// resources. To be used for workloads where hardware_concurrency() proves to
   /// be less efficient. Avoid this strategy if doing lots of I/O. Currently
@@ -182,6 +194,18 @@ void llvm_execute_on_thread_async(
     return S;
   }
 
+  /// Like heavyweight_hardware_concurrency() above, but builds a strategy
+  /// based on the rules described for get_threadpool_strategy().
+  /// If \p Num is invalid, returns a default strategy where one thread per
+  /// hardware core is used.
+  inline ThreadPoolStrategy heavyweight_hardware_concurrency(StringRef Num) {
+    Optional<ThreadPoolStrategy> S =
+        get_threadpool_strategy(Num, heavyweight_hardware_concurrency());
+    if (S)
+      return *S;
+    return heavyweight_hardware_concurrency();
+  }
+
   /// Returns a default thread strategy where all available hardware ressources
   /// are to be used, except for those initially excluded by an affinity mask.
   /// This function takes affinity into consideration. Returns 1 when LLVM is

diff  --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index f8affcb20cef..f652bd628770 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -477,7 +477,8 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
 LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
     : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
   if (!Backend)
-    this->Backend = createInProcessThinBackend();
+    this->Backend =
+        createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
 }
 
 LTO::LTO(Config Conf, ThinBackend Backend,
@@ -1090,13 +1091,12 @@ class InProcessThinBackend : public ThinBackendProc {
 public:
   InProcessThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
-      unsigned ThinLTOParallelismLevel,
+      ThreadPoolStrategy ThinLTOParallelism,
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream, NativeObjectCache Cache)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
-        BackendThreadPool(
-            heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
-        AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+        BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
+        Cache(std::move(Cache)) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
@@ -1192,13 +1192,13 @@ class InProcessThinBackend : public ThinBackendProc {
 };
 } // end anonymous namespace
 
-ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) {
+ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) {
   return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
              const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
              AddStreamFn AddStream, NativeObjectCache Cache) {
     return std::make_unique<InProcessThinBackend>(
-        Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries,
-        AddStream, Cache);
+        Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream,
+        Cache);
   };
 }
 

diff  --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp
index de5adaddd9d3..39de05892981 100644
--- a/llvm/lib/Support/Threading.cpp
+++ b/llvm/lib/Support/Threading.cpp
@@ -84,16 +84,34 @@ void llvm::llvm_execute_on_thread_async(
 int computeHostNumHardwareThreads();
 
 unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  if (ThreadsRequested > 0)
+    return ThreadsRequested;
+
   int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads()
                                        : sys::getHostNumPhysicalCores();
   if (MaxThreadCount <= 0)
     MaxThreadCount = 1;
+  return MaxThreadCount;
+}
 
-  // No need to create more threads than there are hardware threads, it would
-  // uselessly induce more context-switching and cache eviction.
-  if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount)
-    return MaxThreadCount;
-  return ThreadsRequested;
+Optional<ThreadPoolStrategy>
+llvm::get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default) {
+  if (Num == "all")
+    return llvm::hardware_concurrency();
+  if (Num.empty())
+    return Default;
+  unsigned V;
+  if (Num.getAsInteger(10, V))
+    return None; // malformed 'Num' value
+  if (V == 0)
+    return Default;
+
+  // Do not take the Default into account. This effectively disables
+  // heavyweight_hardware_concurrency() if the user asks for any number of
+  // threads on the cmd-line.
+  ThreadPoolStrategy S = llvm::hardware_concurrency();
+  S.ThreadsRequested = V;
+  return S;
 }
 
 namespace {

diff  --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc
index 8cacaa83e961..307a24469144 100644
--- a/llvm/lib/Support/Unix/Threading.inc
+++ b/llvm/lib/Support/Unix/Threading.inc
@@ -273,7 +273,7 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
 int computeHostNumHardwareThreads() {
 #if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
   cpu_set_t Set;
-  if (sched_getaffinity(0, sizeof(Set), &Set))
+  if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
     return CPU_COUNT(&Set);
 #endif
   // Guard against std::thread::hardware_concurrency() returning 0.

diff  --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
index 50c94068b395..296e87b77695 100644
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -131,6 +131,10 @@ struct ProcessorGroup {
   unsigned UsableThreads;
   unsigned ThreadsPerCore;
   uint64_t Affinity;
+
+  unsigned useableCores() const {
+    return std::max(1U, UsableThreads / ThreadsPerCore);
+  }
 };
 
 template <typename F>
@@ -232,33 +236,41 @@ int computeHostNumHardwareThreads() {
   return Threads;
 }
 
-// Assign the current thread to a more appropriate CPU socket or CPU group
-void llvm::ThreadPoolStrategy::apply_thread_strategy(
-    unsigned ThreadPoolNum) const {
+// Finds the proper CPU socket where a thread number should go. Returns 'None'
+// if the thread shall remain on the actual CPU socket.
+Optional<unsigned>
+llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
   ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+  // Only one CPU socket in the system or process affinity was set, no need to
+  // move the thread(s) to another CPU socket.
+  if (Groups.size() <= 1)
+    return None;
+
+  // We ask for less threads than there are hardware threads per CPU socket, no
+  // need to dispatch threads to other CPU sockets.
+  unsigned MaxThreadsPerSocket =
+      UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores();
+  if (compute_thread_count() <= MaxThreadsPerSocket)
+    return None;
 
   assert(ThreadPoolNum < compute_thread_count() &&
          "The thread index is not within thread strategy's range!");
 
-  // In this mode, the ThreadNumber represents the core number, not the
-  // hyper-thread number. Assumes all NUMA groups have the same amount of
-  // hyper-threads.
-  if (!UseHyperThreads)
-    ThreadPoolNum *= Groups[0].ThreadsPerCore;
-
-  unsigned ThreadRangeStart = 0;
-  for (unsigned I = 0; I < Groups.size(); ++I) {
-    const ProcessorGroup &G = Groups[I];
-    if (ThreadPoolNum >= ThreadRangeStart &&
-        ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {
-
-      GROUP_AFFINITY Affinity{};
-      Affinity.Group = G.ID;
-      Affinity.Mask = G.Affinity;
-      SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
-    }
-    ThreadRangeStart += G.UsableThreads;
-  }
+  // Assumes the same number of hardware threads per CPU socket.
+  return (ThreadPoolNum * Groups.size()) / compute_thread_count();
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {
+  Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
+  if (!Socket)
+    return;
+  ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+  GROUP_AFFINITY Affinity{};
+  Affinity.Group = Groups[*Socket].ID;
+  Affinity.Mask = Groups[*Socket].Affinity;
+  SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
 }
 
 llvm::BitVector llvm::get_thread_affinity_mask() {

diff  --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp
index 2db571617181..383f972b8636 100644
--- a/llvm/tools/gold/gold-plugin.cpp
+++ b/llvm/tools/gold/gold-plugin.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
 #include <list>
 #include <map>
@@ -134,11 +135,9 @@ namespace options {
   };
   static OutputType TheOutputType = OT_NORMAL;
   static unsigned OptLevel = 2;
-  // Default parallelism of 0 used to indicate that user did not specify.
-  // Actual parallelism default value depends on implementation.
   // Currently only affects ThinLTO, where the default is the max cores in the
-  // system.
-  static unsigned Parallelism = 0;
+  // system. See llvm::get_threadpool_strategy() for acceptable values.
+  static std::string Parallelism;
   // Default regular LTO codegen parallelism (number of partitions).
   static unsigned ParallelCodeGenParallelismLevel = 1;
 #ifdef NDEBUG
@@ -272,8 +271,10 @@ namespace options {
         message(LDPL_FATAL, "Optimization level must be between 0 and 3");
       OptLevel = opt[1] - '0';
     } else if (opt.startswith("jobs=")) {
-      if (StringRef(opt_ + 5).getAsInteger(10, Parallelism))
-        message(LDPL_FATAL, "Invalid parallelism level: %s", opt_ + 5);
+      StringRef Num(opt_ + 5);
+      if (!get_threadpool_strategy(Num))
+        message(LDPL_FATAL, "Invalid parallelism level: %s", Num.data());
+      Parallelism = Num;
     } else if (opt.startswith("lto-partitions=")) {
       if (opt.substr(strlen("lto-partitions="))
               .getAsInteger(10, ParallelCodeGenParallelismLevel))
@@ -877,14 +878,15 @@ static std::unique_ptr<LTO> createLTO(IndexWriteCallback OnIndexWrite,
   Conf.PTO.LoopVectorization = options::OptLevel > 1;
   Conf.PTO.SLPVectorization = options::OptLevel > 1;
 
-  if (options::Parallelism)
-    Backend = createInProcessThinBackend(options::Parallelism);
   if (options::thinlto_index_only) {
     std::string OldPrefix, NewPrefix;
     getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix);
     Backend = createWriteIndexesThinBackend(OldPrefix, NewPrefix,
                                             options::thinlto_emit_imports_files,
                                             LinkedObjectsFile, OnIndexWrite);
+  } else {
+    Backend = createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(options::Parallelism));
   }
 
   Conf.OverrideTriple = options::triple;

diff  --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index dda596be0ce6..6d78082f980e 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -68,9 +68,10 @@ static cl::opt<bool>
                                        "distributed backend case"));
 
 // Default to using all available threads in the system, but using only one
-// thread per core, as indicated by the usage of
-// heavyweight_hardware_concurrency() in the InProcessThinBackend constructor.
-static cl::opt<int> Threads("thinlto-threads", cl::init(0));
+// thread per core (no SMT).
+// Use -thinlto-threads=all to use hardware_concurrency() instead, which means
+// to use all hardware threads or cores in the system.
+static cl::opt<std::string> Threads("thinlto-threads");
 
 static cl::list<std::string> SymbolResolutions(
     "r",
@@ -286,7 +287,8 @@ static int run(int argc, char **argv) {
                                             /* LinkedObjectsFile */ nullptr,
                                             /* OnWrite */ {});
   else
-    Backend = createInProcessThinBackend(Threads);
+    Backend = createInProcessThinBackend(
+        llvm::heavyweight_hardware_concurrency(Threads));
   LTO Lto(std::move(Conf), std::move(Backend));
 
   bool HasErrors = false;