[llvm] [MCA][ResourceManager] Fix a bug in the instruction issue logic. (PR #108386)

Andrea Di Biagio via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 13 05:05:37 PDT 2024


https://github.com/adibiagio updated https://github.com/llvm/llvm-project/pull/108386

>From c3f75328e28f60a72d4ab1d727011d120e91902a Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <andrea.dibiagio at gmail.com>
Date: Thu, 12 Sep 2024 14:00:33 +0100
Subject: [PATCH 1/2] [MCA][ResourceManager] Fix a bug in the instruction issue
 logic.

This is a fix for issue #108157.

Before this patch, the pipeline selection logic in ResourceManager::issueInstruction() didn't know how to correctly handle
instructions which consume multiple partially overlapping resource groups.
In some cases (like the test case from #108157), the inability to correctly allocate resources on instruction issue was leading to crashes.

The presence of multiple partially overlapping groups complicates the selection process by introducing extra constraints.
For those cases, the issue logic now prioritizes groups which are more constrained than others.
---
 .../llvm/MCA/HardwareUnits/ResourceManager.h  | 20 +++-
 .../lib/MCA/HardwareUnits/ResourceManager.cpp | 87 +++++++++++++++++-
 .../partially-overlapping-group-resources.s   | 91 +++++++++++++++++++
 3 files changed, 192 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s

diff --git a/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h b/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
index 0e3f16d2a490b5..1742039fc3752a 100644
--- a/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
+++ b/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
@@ -232,6 +232,8 @@ class ResourceState {
   /// `NumUnits` available units.
   bool isReady(unsigned NumUnits = 1) const;
 
+  uint64_t getNumReadyUnits() const { return llvm::popcount(ReadyMask); }
+
   bool isAResourceGroup() const { return IsAGroup; }
 
   bool containsResource(uint64_t ID) const { return ResourceMask & ID; }
@@ -428,9 +430,21 @@ class ResourceManager {
   uint64_t getProcResUnitMask() const { return ProcResUnitMask; }
   uint64_t getAvailableProcResUnits() const { return AvailableProcResUnits; }
 
-  void issueInstruction(
-      const InstrDesc &Desc,
-      SmallVectorImpl<std::pair<ResourceRef, ReleaseAtCycles>> &Pipes);
+  using ResourceWithCycles = std::pair<ResourceRef, ReleaseAtCycles>;
+
+  void issueInstruction(const InstrDesc &Desc,
+                        SmallVectorImpl<ResourceWithCycles> &Pipes) {
+    if (Desc.HasPartiallyOverlappingGroups)
+      return issueInstructionImpl(Desc, Pipes);
+
+    return fastIssueInstruction(Desc, Pipes);
+  }
+
+  void fastIssueInstruction(const InstrDesc &Desc,
+                            SmallVectorImpl<ResourceWithCycles> &Pipes);
+
+  void issueInstructionImpl(const InstrDesc &Desc,
+                            SmallVectorImpl<ResourceWithCycles> &Pipes);
 
   void cycleEvent(SmallVectorImpl<ResourceRef> &ResourcesFreed);
 
diff --git a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
index 8d99695f4c29e5..b5ecfdb2a3441d 100644
--- a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
+++ b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
@@ -344,9 +344,90 @@ uint64_t ResourceManager::checkAvailability(const InstrDesc &Desc) const {
   return BusyResourceMask;
 }
 
-void ResourceManager::issueInstruction(
-    const InstrDesc &Desc,
-    SmallVectorImpl<std::pair<ResourceRef, ReleaseAtCycles>> &Pipes) {
+void ResourceManager::issueInstructionImpl(
+    const InstrDesc &Desc, SmallVectorImpl<ResourceWithCycles> &Pipes) {
+
+  // Step 1.
+  // - Issue writes to non-group resources.
+  // - Issue writes to groups with only a single resource unit available.
+  // - Update reserved groups (if any)
+  // - Add any remaining resource usage requests to a Worklist.
+  SmallVector<std::pair<uint64_t, ResourceUsage>, 4> Worklist;
+
+  using ResourceWithUsage = std::pair<uint64_t, ResourceUsage>;
+
+  for (const ResourceWithUsage &R : Desc.Resources) {
+    const CycleSegment &CS = R.second.CS;
+    if (!CS.size()) {
+      releaseResource(R.first);
+      continue;
+    }
+
+    assert(CS.begin() == 0 && "Invalid {Start, End} cycles!");
+    if (R.second.isReserved()) {
+      assert((llvm::popcount(R.first) > 1) && "Expected a group!");
+      // Mark this group as reserved.
+      assert(R.second.isReserved());
+      reserveResource(R.first);
+      BusyResources[ResourceRef(R.first, R.first)] += CS.size();
+      continue;
+    }
+
+    const ResourceState &RS = *Resources[getResourceStateIndex(R.first)];
+    if (RS.isAResourceGroup() && RS.getNumReadyUnits() > 1) {
+      Worklist.push_back(R);
+      continue;
+    }
+
+    ResourceRef Pipe = selectPipe(R.first);
+    use(Pipe);
+    BusyResources[Pipe] += CS.size();
+    Pipes.emplace_back(std::make_pair(Pipe, ReleaseAtCycles(CS.size())));
+  }
+
+  // Step 2.
+  // Prioritize writes to groups with less available resources.
+  // NOTE: this algorithm has quadratic complexity in the worst case scenario.
+  // On average, this algorithm is expected to perform quite well and always
+  // converge in very few iterations. That is mainly because instructions rarely
+  // consume more than two or three resource groups.
+
+  while (!Worklist.empty()) {
+    sort(Worklist, [&](const ResourceWithUsage &lhs,
+                       const ResourceWithUsage &rhs) {
+      const ResourceState &lhsRS = *Resources[getResourceStateIndex(lhs.first)];
+      const ResourceState &rhsRS = *Resources[getResourceStateIndex(rhs.first)];
+      uint64_t lhsReadyUnits = lhsRS.getNumReadyUnits();
+      uint64_t rhsReadyUnits = rhsRS.getNumReadyUnits();
+      if (lhsReadyUnits == rhsReadyUnits)
+        return lhs.first < rhs.first;
+      return lhsReadyUnits < rhsReadyUnits;
+    });
+
+    SmallVector<ResourceWithUsage, 4> NewWorklist;
+
+    for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
+      const auto &Elt = Worklist[I];
+      const ResourceState &RS = *Resources[getResourceStateIndex(Elt.first)];
+
+      if (I == 0 || RS.getNumReadyUnits() == 1) {
+        ResourceRef Pipe = selectPipe(Elt.first);
+        use(Pipe);
+        const CycleSegment &CS = Elt.second.CS;
+        BusyResources[Pipe] += CS.size();
+        Pipes.emplace_back(std::make_pair(Pipe, ReleaseAtCycles(CS.size())));
+        continue;
+      }
+
+      NewWorklist.push_back(Elt);
+    }
+
+    swap(NewWorklist, Worklist);
+  };
+}
+
+void ResourceManager::fastIssueInstruction(
+    const InstrDesc &Desc, SmallVectorImpl<ResourceWithCycles> &Pipes) {
   for (const std::pair<uint64_t, ResourceUsage> &R : Desc.Resources) {
     const CycleSegment &CS = R.second.CS;
     if (!CS.size()) {
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s b/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s
new file mode 100644
index 00000000000000..1110a21fe891d4
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s
@@ -0,0 +1,91 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver4 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
+
+# This is a test for issue #108157.
+#
+# llvm-mca was crashing when analyzing the following code snippet. This was due to a bug in the
+# instruction issue logic (from class ResourceManager) which was affecting instructions declaring
+# consumption of partially overlapping resources.
+# This test makes sure that analysis is successful, and that no crash occurs.
+
+.intel_syntax
+vpconflictd zmm0, zmm3
+kxnorw k1, k1, k1
+vpxord zmm1, zmm1, zmm1
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      300
+# CHECK-NEXT: Total Cycles:      278
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    2.16
+# CHECK-NEXT: IPC:               1.08
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  4      6     1.50                        vpconflictd	zmm0, zmm3
+# CHECK-NEXT:  1      1     0.50                        kxnorw	k1, k1, k1
+# CHECK-NEXT:  1      1     0.50                        vpxord	zmm1, zmm1, zmm1
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - Zn4AGU0
+# CHECK-NEXT: [1]   - Zn4AGU1
+# CHECK-NEXT: [2]   - Zn4AGU2
+# CHECK-NEXT: [3]   - Zn4ALU0
+# CHECK-NEXT: [4]   - Zn4ALU1
+# CHECK-NEXT: [5]   - Zn4ALU2
+# CHECK-NEXT: [6]   - Zn4ALU3
+# CHECK-NEXT: [7]   - Zn4BRU1
+# CHECK-NEXT: [8]   - Zn4FP0
+# CHECK-NEXT: [9]   - Zn4FP1
+# CHECK-NEXT: [10]  - Zn4FP2
+# CHECK-NEXT: [11]  - Zn4FP3
+# CHECK-NEXT: [12.0] - Zn4FP45
+# CHECK-NEXT: [12.1] - Zn4FP45
+# CHECK-NEXT: [13]  - Zn4FPSt
+# CHECK-NEXT: [14.0] - Zn4LSU
+# CHECK-NEXT: [14.1] - Zn4LSU
+# CHECK-NEXT: [14.2] - Zn4LSU
+# CHECK-NEXT: [15.0] - Zn4Load
+# CHECK-NEXT: [15.1] - Zn4Load
+# CHECK-NEXT: [15.2] - Zn4Load
+# CHECK-NEXT: [16.0] - Zn4Store
+# CHECK-NEXT: [16.1] - Zn4Store
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.22   2.24   2.26   2.28    -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.84   1.64   1.60   1.92    -      -      -      -      -      -      -      -      -      -      -     vpconflictd	zmm0, zmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.64   0.36    -      -      -      -      -      -      -      -      -      -      -     kxnorw	k1, k1, k1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.38   0.60   0.02    -      -      -      -      -      -      -      -      -      -      -      -     vpxord	zmm1, zmm1, zmm1
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeeER   vpconflictd	zmm0, zmm3
+# CHECK-NEXT: [0,1]     D==eE---R   kxnorw	k1, k1, k1
+# CHECK-NEXT: [0,2]     DeE-----R   vpxord	zmm1, zmm1, zmm1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vpconflictd	zmm0, zmm3
+# CHECK-NEXT: 1.     1     3.0    3.0    3.0       kxnorw	k1, k1, k1
+# CHECK-NEXT: 2.     1     1.0    1.0    5.0       vpxord	zmm1, zmm1, zmm1
+# CHECK-NEXT:        1     1.7    1.7    2.7       <total>

>From de3280fd3fd2581a9023e4918b57f83bd15b503e Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <andrea.dibiagio at gmail.com>
Date: Fri, 13 Sep 2024 12:49:41 +0100
Subject: [PATCH 2/2] Address review comments.

---
 .../llvm/MCA/HardwareUnits/ResourceManager.h  | 11 ++++++++++
 .../lib/MCA/HardwareUnits/ResourceManager.cpp | 18 +++++++--------
 .../partially-overlapping-group-resources.s   | 22 +++++++++----------
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h b/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
index 1742039fc3752a..f71928f835c0e9 100644
--- a/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
+++ b/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
@@ -440,9 +440,20 @@ class ResourceManager {
     return fastIssueInstruction(Desc, Pipes);
   }
 
+  // Selects pipeline resources consumed by an instruction.
+  // This method works under the assumption that used group resources don't
+  // partially overlap. The logic is guaranteed to find a valid resource unit
+  // schedule, no matter in which order individual uses are processed. For that
+  // reason, the vector of resource uses is simply (and quickly) processed in
+  // sequence. The resulting schedule is eventually stored into vector `Pipes`.
   void fastIssueInstruction(const InstrDesc &Desc,
                             SmallVectorImpl<ResourceWithCycles> &Pipes);
 
+  // Selects pipeline resources consumed by an instruction.
+  // This method works under the assumption that used resource groups may
+  // partially overlap. This complicates the selection process, because the
+  // order in which uses are processed matters. The logic internally prioritizes
+  // groups which are more constrained than others.
   void issueInstructionImpl(const InstrDesc &Desc,
                             SmallVectorImpl<ResourceWithCycles> &Pipes);
 
diff --git a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
index b5ecfdb2a3441d..82030207eee616 100644
--- a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
+++ b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
@@ -393,15 +393,15 @@ void ResourceManager::issueInstructionImpl(
   // consume more than two or three resource groups.
 
   while (!Worklist.empty()) {
-    sort(Worklist, [&](const ResourceWithUsage &lhs,
-                       const ResourceWithUsage &rhs) {
-      const ResourceState &lhsRS = *Resources[getResourceStateIndex(lhs.first)];
-      const ResourceState &rhsRS = *Resources[getResourceStateIndex(rhs.first)];
-      uint64_t lhsReadyUnits = lhsRS.getNumReadyUnits();
-      uint64_t rhsReadyUnits = rhsRS.getNumReadyUnits();
-      if (lhsReadyUnits == rhsReadyUnits)
-        return lhs.first < rhs.first;
-      return lhsReadyUnits < rhsReadyUnits;
+    sort(Worklist, [&](const ResourceWithUsage &Lhs,
+                       const ResourceWithUsage &Rhs) {
+      const ResourceState &LhsRS = *Resources[getResourceStateIndex(Lhs.first)];
+      const ResourceState &RhsRS = *Resources[getResourceStateIndex(Rhs.first)];
+      uint64_t LhsReadyUnits = LhsRS.getNumReadyUnits();
+      uint64_t RhsReadyUnits = RhsRS.getNumReadyUnits();
+      if (LhsReadyUnits == RhsReadyUnits)
+        return Lhs.first < Rhs.first;
+      return LhsReadyUnits < RhsReadyUnits;
     });
 
     SmallVector<ResourceWithUsage, 4> NewWorklist;
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s b/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s
index 1110a21fe891d4..7705c900ddff12 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s
@@ -15,12 +15,12 @@ vpxord zmm1, zmm1, zmm1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      300
-# CHECK-NEXT: Total Cycles:      278
+# CHECK-NEXT: Total Cycles:      209
 # CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    6
-# CHECK-NEXT: uOps Per Cycle:    2.16
-# CHECK-NEXT: IPC:               1.08
+# CHECK-NEXT: uOps Per Cycle:    2.87
+# CHECK-NEXT: IPC:               1.44
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Instruction Info:
@@ -34,7 +34,7 @@ vpxord zmm1, zmm1, zmm1
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  4      6     1.50                        vpconflictd	zmm0, zmm3
 # CHECK-NEXT:  1      1     0.50                        kxnorw	k1, k1, k1
-# CHECK-NEXT:  1      1     0.50                        vpxord	zmm1, zmm1, zmm1
+# CHECK-NEXT:  1      0     0.17                        vpxord	zmm1, zmm1, zmm1
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - Zn4AGU0
@@ -63,20 +63,20 @@ vpxord zmm1, zmm1, zmm1
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.22   2.24   2.26   2.28    -      -      -      -      -      -      -      -      -      -      -
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.76   1.76   1.73   1.75    -      -      -      -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.84   1.64   1.60   1.92    -      -      -      -      -      -      -      -      -      -      -     vpconflictd	zmm0, zmm3
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.64   0.36    -      -      -      -      -      -      -      -      -      -      -     kxnorw	k1, k1, k1
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.38   0.60   0.02    -      -      -      -      -      -      -      -      -      -      -      -     vpxord	zmm1, zmm1, zmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.76   1.76   0.74   1.74    -      -      -      -      -      -      -      -      -      -      -     vpconflictd	zmm0, zmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -     kxnorw	k1, k1, k1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxord	zmm1, zmm1, zmm1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeeeeeER   vpconflictd	zmm0, zmm3
 # CHECK-NEXT: [0,1]     D==eE---R   kxnorw	k1, k1, k1
-# CHECK-NEXT: [0,2]     DeE-----R   vpxord	zmm1, zmm1, zmm1
+# CHECK-NEXT: [0,2]     D-------R   vpxord	zmm1, zmm1, zmm1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -87,5 +87,5 @@ vpxord zmm1, zmm1, zmm1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       vpconflictd	zmm0, zmm3
 # CHECK-NEXT: 1.     1     3.0    3.0    3.0       kxnorw	k1, k1, k1
-# CHECK-NEXT: 2.     1     1.0    1.0    5.0       vpxord	zmm1, zmm1, zmm1
-# CHECK-NEXT:        1     1.7    1.7    2.7       <total>
+# CHECK-NEXT: 2.     1     0.0    0.0    7.0       vpxord	zmm1, zmm1, zmm1
+# CHECK-NEXT:        1     1.3    1.3    3.3       <total>



More information about the llvm-commits mailing list