[llvm] e66500c - [Support] On Windows 11 and Windows Server 2022, fix an affinity mask issue on large core count machines

Fri Jan 6 14:04:12 PST 2023

Author: Alexandre Ganea
Date: 2023-01-06T17:03:43-05:00
New Revision: e66500c7749578cf5e1372d0b52781259cb1fa28

URL: https://github.com/llvm/llvm-project/commit/e66500c7749578cf5e1372d0b52781259cb1fa28
DIFF: https://github.com/llvm/llvm-project/commit/e66500c7749578cf5e1372d0b52781259cb1fa28.diff

LOG: [Support] On Windows 11 and Windows Server 2022, fix an affinity mask issue on large core count machines

Before Windows 11 and Windows Server 2022, a starting process was assigned to only one 'processor group' by default, and the program was responsible for dispatching its own threads to additional 'processor groups'. That is what 8404aeb56a73ab24f9b295111de3b37a37f0b841 did, allowing LLVM tools to automatically use all hardware threads in the machine.

Starting with Windows 11 and Windows Server 2022, the OS takes care of that. This has an adverse effect, reported in #56618: the `GetProcessAffinityMask()` API now seems buggy in some edge cases. That API is used to detect whether an affinity mask was set, and to adjust the number of threads available to a ThreadPool accordingly.

With this patch, on the one hand, we let the OS dispatch threads across all 'processor groups', but only on Windows 11, Windows Server 2022, and later; we retain the old behavior on older OS versions. On the other hand, a workaround was added to mitigate the `GetProcessAffinityMask()` issue described above (see Threading.inc, L226).

Differential Revision: https://reviews.llvm.org/D138747

Added: 
    

Modified: 
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/Support/Windows/WindowsSupport.h
    llvm/lib/Support/Windows/Process.inc
    llvm/lib/Support/Windows/Threading.inc
    llvm/unittests/Support/ThreadPool.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 30d3d7e5be6af..53ea9cd3f0bef 100644

--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -176,6 +176,14 @@ Changes to the Windows Target
   This roughly makes hidden visibility work like it does for other object
   file formats.
 
+* When using multi-threaded LLVM tools (such as LLD) on a Windows host with a
+  large number of processors or CPU sockets, previously the LLVM ThreadPool
+  would span out threads to use all processors.
+  Starting with Windows Server 2022 and Windows 11, the behavior has changed,
+  the OS now spans out threads automatically to all processors. This also fixes
+  an affinity mask issue.
+  (`D138747 <https://reviews.llvm.org/D138747>`_)
+
 Changes to the X86 Backend
 --------------------------
 

diff  --git a/llvm/include/llvm/Support/Windows/WindowsSupport.h b/llvm/include/llvm/Support/Windows/WindowsSupport.h
index 917822678e979..d3aacd14b2097 100644
--- a/llvm/include/llvm/Support/Windows/WindowsSupport.h
+++ b/llvm/include/llvm/Support/Windows/WindowsSupport.h
@@ -59,6 +59,9 @@ namespace llvm {
 /// yet have VersionHelpers.h, so we have our own helper.
 bool RunningWindows8OrGreater();
 
+/// Determines if the program is running on Windows 11 or Windows Server 2022.
+bool RunningWindows11OrGreater();
+
 /// Returns the Windows version as Major.Minor.0.BuildNumber. Uses
 /// RtlGetVersion or GetVersionEx under the hood depending on what is available.
 /// GetVersionEx is deprecated, but this API exposes the build number which can

diff  --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index 4786199b4d9e3..493209052a1c5 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -477,20 +477,30 @@ unsigned Process::GetRandomNumber() {
 typedef NTSTATUS(WINAPI *RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
 #define STATUS_SUCCESS ((NTSTATUS)0x00000000L)
 
-llvm::VersionTuple llvm::GetWindowsOSVersion() {
-  HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
-  if (hMod) {
+static RTL_OSVERSIONINFOEXW GetWindowsVer() {
+  auto getVer = []() -> RTL_OSVERSIONINFOEXW {
+    HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
+    assert(hMod);
+
     auto getVer = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion");
-    if (getVer) {
-      RTL_OSVERSIONINFOEXW info{};
-      info.dwOSVersionInfoSize = sizeof(info);
-      if (getVer((PRTL_OSVERSIONINFOW)&info) == STATUS_SUCCESS) {
-        return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
-                                  info.dwBuildNumber);
-      }
-    }
-  }
-  return llvm::VersionTuple(0, 0, 0, 0);
+    assert(getVer);
+
+    RTL_OSVERSIONINFOEXW info{};
+    info.dwOSVersionInfoSize = sizeof(info);
+    NTSTATUS r = getVer((PRTL_OSVERSIONINFOW)&info);
+    (void)r;
+    assert(r == STATUS_SUCCESS);
+
+    return info;
+  };
+  static RTL_OSVERSIONINFOEXW info = getVer();
+  return info;
+}
+
+llvm::VersionTuple llvm::GetWindowsOSVersion() {
+  RTL_OSVERSIONINFOEXW info = GetWindowsVer();
+  return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
+                            info.dwBuildNumber);
 }
 
 bool llvm::RunningWindows8OrGreater() {
@@ -498,6 +508,19 @@ bool llvm::RunningWindows8OrGreater() {
   return GetWindowsOSVersion() >= llvm::VersionTuple(6, 2, 0, 0);
 }
 
+bool llvm::RunningWindows11OrGreater() {
+  RTL_OSVERSIONINFOEXW info = GetWindowsVer();
+  auto ver = llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
+                                info.dwBuildNumber);
+
+  // Windows Server 2022
+  if (info.wProductType == VER_NT_SERVER)
+    return ver >= llvm::VersionTuple(10, 0, 0, 20348);
+
+  // Windows 11
+  return ver >= llvm::VersionTuple(10, 0, 0, 22000);
+}
+
 [[noreturn]] void Process::ExitNoCleanup(int RetCode) {
   TerminateProcess(GetCurrentProcess(), RetCode);
   llvm_unreachable("TerminateProcess doesn't return");

diff  --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
index 2c16fe442b703..aa47484cb5cec 100644
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -159,6 +159,22 @@ static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
   return true;
 }
 
+static std::optional<std::vector<USHORT>> getActiveGroups() {
+  USHORT Count = 0;
+  if (::GetProcessGroupAffinity(GetCurrentProcess(), &Count, nullptr))
+    return std::nullopt;
+
+  if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+    return std::nullopt;
+
+  std::vector<USHORT> Groups;
+  Groups.resize(Count);
+  if (!::GetProcessGroupAffinity(GetCurrentProcess(), &Count, Groups.data()))
+    return std::nullopt;
+
+  return Groups;
+}
+
 static ArrayRef<ProcessorGroup> getProcessorGroups() {
   auto computeGroups = []() {
     SmallVector<ProcessorGroup, 4> Groups;
@@ -193,22 +209,28 @@ static ArrayRef<ProcessorGroup> getProcessorGroups() {
     if (!IterateProcInfo(RelationProcessorCore, HandleProc))
       return std::vector<ProcessorGroup>();
 
+    auto ActiveGroups = getActiveGroups();
+    if (!ActiveGroups)
+      return std::vector<ProcessorGroup>();
+
     // If there's an affinity mask set, assume the user wants to constrain the
     // current process to only a single CPU group. On Windows, it is not
     // possible for affinity masks to cross CPU group boundaries.
     DWORD_PTR ProcessAffinityMask = 0, SystemAffinityMask = 0;
     if (::GetProcessAffinityMask(GetCurrentProcess(), &ProcessAffinityMask,
-                                 &SystemAffinityMask) &&
-        ProcessAffinityMask != SystemAffinityMask) {
-      // We don't expect more that 4 CPU groups on Windows (256 processors).
-      USHORT GroupCount = 4;
-      USHORT GroupArray[4]{};
-      if (::GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount,
-                                    GroupArray)) {
-        assert(GroupCount == 1 &&
-               "On startup, a program is expected to be assigned only to "
-               "one processor group!");
-        unsigned CurrentGroupID = GroupArray[0];
+                                 &SystemAffinityMask)) {
+
+      if (ProcessAffinityMask != SystemAffinityMask) {
+        if (llvm::RunningWindows11OrGreater() && ActiveGroups->size() > 1) {
+          // The process affinity mask is spurious, due to an OS bug, ignore it.
+          return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
+        }
+
+        assert(ActiveGroups->size() == 1 &&
+               "When an affinity mask is set, the process is expected to be "
+               "assigned to a single processor group!");
+
+        unsigned CurrentGroupID = (*ActiveGroups)[0];
         ProcessorGroup NewG{Groups[CurrentGroupID]};
         NewG.Affinity = ProcessAffinityMask;
         NewG.UsableThreads = countPopulation(ProcessAffinityMask);
@@ -216,7 +238,6 @@ static ArrayRef<ProcessorGroup> getProcessorGroups() {
         Groups.push_back(NewG);
       }
     }
-
     return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
   };
   static auto Groups = computeGroups();
@@ -273,6 +294,12 @@ llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
 // Assign the current thread to a more appropriate CPU socket or CPU group
 void llvm::ThreadPoolStrategy::apply_thread_strategy(
     unsigned ThreadPoolNum) const {
+
+  // After Windows 11 and Windows Server 2022, let the OS do the scheduling,
+  // since a process automatically gains access to all processor groups.
+  if (llvm::RunningWindows11OrGreater())
+    return;
+
   std::optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
   if (!Socket)
     return;

diff  --git a/llvm/unittests/Support/ThreadPool.cpp b/llvm/unittests/Support/ThreadPool.cpp
index fd9d7272e7e0b..faaeea6d80fd9 100644
--- a/llvm/unittests/Support/ThreadPool.cpp
+++ b/llvm/unittests/Support/ThreadPool.cpp
@@ -18,6 +18,10 @@
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/Threading.h"
 
+#ifdef _WIN32
+#include "llvm/Support/Windows/WindowsSupport.h"
+#endif
+
 #include <chrono>
 #include <thread>
 
@@ -378,12 +382,22 @@ ThreadPoolTest::RunOnAllSockets(ThreadPoolStrategy S) {
 
 TEST_F(ThreadPoolTest, AllThreads_UseAllRessources) {
   CHECK_UNSUPPORTED();
+  // After Windows 11, the OS is free to deploy the threads on any CPU socket.
+  // We cannot relibly ensure that all thread affinity mask are covered,
+  // therefore this test should not run.
+  if (llvm::RunningWindows11OrGreater())
+    return;
   std::vector<llvm::BitVector> ThreadsUsed = RunOnAllSockets({});
   ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size());
 }
 
 TEST_F(ThreadPoolTest, AllThreads_OneThreadPerCore) {
   CHECK_UNSUPPORTED();
+  // After Windows 11, the OS is free to deploy the threads on any CPU socket.
+  // We cannot relibly ensure that all thread affinity mask are covered,
+  // therefore this test should not run.
+  if (llvm::RunningWindows11OrGreater())
+    return;
   std::vector<llvm::BitVector> ThreadsUsed =
       RunOnAllSockets(llvm::heavyweight_hardware_concurrency());
   ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size());