[llvm] [MCA][ResourceManager] Fix a bug in the instruction issue logic. (PR #108386)

Andrea Di Biagio via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 12 06:38:54 PDT 2024


https://github.com/adibiagio updated https://github.com/llvm/llvm-project/pull/108386

>From c3f75328e28f60a72d4ab1d727011d120e91902a Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <andrea.dibiagio at gmail.com>
Date: Thu, 12 Sep 2024 14:00:33 +0100
Subject: [PATCH] [MCA][ResourceManager] Fix a bug in the instruction issue
 logic.

This is a fix for issue #108157.

Before this patch, the pipeline selection logic in ResourceManager::issueInstruction() didn't know how to correctly handle
instructions which consume multiple partially overlapping resource groups.
In some cases (like the test case from #108157), the inability to correctly allocate resources on instruction issue was leading to crashes.

The presence of multiple partially overlapping groups complicates the selection process by introducing extra constraints.
For those cases, the issue logic now prioritizes groups which are more constrained than others.
---
 .../llvm/MCA/HardwareUnits/ResourceManager.h  | 20 +++-
 .../lib/MCA/HardwareUnits/ResourceManager.cpp | 87 +++++++++++++++++-
 .../partially-overlapping-group-resources.s   | 91 +++++++++++++++++++
 3 files changed, 192 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s

diff --git a/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h b/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
index 0e3f16d2a490b5..1742039fc3752a 100644
--- a/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
+++ b/llvm/include/llvm/MCA/HardwareUnits/ResourceManager.h
@@ -232,6 +232,8 @@ class ResourceState {
   /// `NumUnits` available units.
   bool isReady(unsigned NumUnits = 1) const;
 
+  uint64_t getNumReadyUnits() const { return llvm::popcount(ReadyMask); }
+
   bool isAResourceGroup() const { return IsAGroup; }
 
   bool containsResource(uint64_t ID) const { return ResourceMask & ID; }
@@ -428,9 +430,21 @@ class ResourceManager {
   uint64_t getProcResUnitMask() const { return ProcResUnitMask; }
   uint64_t getAvailableProcResUnits() const { return AvailableProcResUnits; }
 
-  void issueInstruction(
-      const InstrDesc &Desc,
-      SmallVectorImpl<std::pair<ResourceRef, ReleaseAtCycles>> &Pipes);
+  using ResourceWithCycles = std::pair<ResourceRef, ReleaseAtCycles>;
+
+  void issueInstruction(const InstrDesc &Desc,
+                        SmallVectorImpl<ResourceWithCycles> &Pipes) {
+    if (Desc.HasPartiallyOverlappingGroups)
+      return issueInstructionImpl(Desc, Pipes);
+
+    return fastIssueInstruction(Desc, Pipes);
+  }
+
+  void fastIssueInstruction(const InstrDesc &Desc,
+                            SmallVectorImpl<ResourceWithCycles> &Pipes);
+
+  void issueInstructionImpl(const InstrDesc &Desc,
+                            SmallVectorImpl<ResourceWithCycles> &Pipes);
 
   void cycleEvent(SmallVectorImpl<ResourceRef> &ResourcesFreed);
 
diff --git a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
index 8d99695f4c29e5..b5ecfdb2a3441d 100644
--- a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
+++ b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp
@@ -344,9 +344,90 @@ uint64_t ResourceManager::checkAvailability(const InstrDesc &Desc) const {
   return BusyResourceMask;
 }
 
-void ResourceManager::issueInstruction(
-    const InstrDesc &Desc,
-    SmallVectorImpl<std::pair<ResourceRef, ReleaseAtCycles>> &Pipes) {
+void ResourceManager::issueInstructionImpl(
+    const InstrDesc &Desc, SmallVectorImpl<ResourceWithCycles> &Pipes) {
+
+  // Step 1.
+  // - Issue writes to non-group resources.
+  // - Issue writes to groups with only a single resource unit available.
+  // - Update reserved groups (if any)
+  // - Add any remaining resource usage requests to a Worklist.
+  SmallVector<std::pair<uint64_t, ResourceUsage>, 4> Worklist;
+
+  using ResourceWithUsage = std::pair<uint64_t, ResourceUsage>;
+
+  for (const ResourceWithUsage &R : Desc.Resources) {
+    const CycleSegment &CS = R.second.CS;
+    if (!CS.size()) {
+      releaseResource(R.first);
+      continue;
+    }
+
+    assert(CS.begin() == 0 && "Invalid {Start, End} cycles!");
+    if (R.second.isReserved()) {
+      assert((llvm::popcount(R.first) > 1) && "Expected a group!");
+      // Mark this group as reserved.
+      assert(R.second.isReserved());
+      reserveResource(R.first);
+      BusyResources[ResourceRef(R.first, R.first)] += CS.size();
+      continue;
+    }
+
+    const ResourceState &RS = *Resources[getResourceStateIndex(R.first)];
+    if (RS.isAResourceGroup() && RS.getNumReadyUnits() > 1) {
+      Worklist.push_back(R);
+      continue;
+    }
+
+    ResourceRef Pipe = selectPipe(R.first);
+    use(Pipe);
+    BusyResources[Pipe] += CS.size();
+    Pipes.emplace_back(std::make_pair(Pipe, ReleaseAtCycles(CS.size())));
+  }
+
+  // Step 2.
+  // Prioritize writes to groups with less available resources.
+  // NOTE: this algorithm has quadratic complexity in the worst case scenario.
+  // On average, this algorithm is expected to perform quite well and always
+  // converge in very few iterations. That is mainly because instructions rarely
+  // consume more than two or three resource groups.
+
+  while (!Worklist.empty()) {
+    sort(Worklist, [&](const ResourceWithUsage &lhs,
+                       const ResourceWithUsage &rhs) {
+      const ResourceState &lhsRS = *Resources[getResourceStateIndex(lhs.first)];
+      const ResourceState &rhsRS = *Resources[getResourceStateIndex(rhs.first)];
+      uint64_t lhsReadyUnits = lhsRS.getNumReadyUnits();
+      uint64_t rhsReadyUnits = rhsRS.getNumReadyUnits();
+      if (lhsReadyUnits == rhsReadyUnits)
+        return lhs.first < rhs.first;
+      return lhsReadyUnits < rhsReadyUnits;
+    });
+
+    SmallVector<ResourceWithUsage, 4> NewWorklist;
+
+    for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
+      const auto &Elt = Worklist[I];
+      const ResourceState &RS = *Resources[getResourceStateIndex(Elt.first)];
+
+      if (I == 0 || RS.getNumReadyUnits() == 1) {
+        ResourceRef Pipe = selectPipe(Elt.first);
+        use(Pipe);
+        const CycleSegment &CS = Elt.second.CS;
+        BusyResources[Pipe] += CS.size();
+        Pipes.emplace_back(std::make_pair(Pipe, ReleaseAtCycles(CS.size())));
+        continue;
+      }
+
+      NewWorklist.push_back(Elt);
+    }
+
+    swap(NewWorklist, Worklist);
+  };
+}
+
+void ResourceManager::fastIssueInstruction(
+    const InstrDesc &Desc, SmallVectorImpl<ResourceWithCycles> &Pipes) {
   for (const std::pair<uint64_t, ResourceUsage> &R : Desc.Resources) {
     const CycleSegment &CS = R.second.CS;
     if (!CS.size()) {
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s b/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s
new file mode 100644
index 00000000000000..1110a21fe891d4
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/partially-overlapping-group-resources.s
@@ -0,0 +1,91 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver4 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
+
+# This is a test for issue #108157.
+#
+# llvm-mca was crashing when analyzing the following code snippet. This was due to a bug in the
+# instruction issue logic (from class ResourceManager) which was affecting instructions declaring
+# consumption of partially overlapping resources.
+# This test makes sure that analysis is successful, and that no crash occurs.
+
+.intel_syntax
+vpconflictd zmm0, zmm3
+kxnorw k1, k1, k1
+vpxord zmm1, zmm1, zmm1
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      300
+# CHECK-NEXT: Total Cycles:      278
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    2.16
+# CHECK-NEXT: IPC:               1.08
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  4      6     1.50                        vpconflictd	zmm0, zmm3
+# CHECK-NEXT:  1      1     0.50                        kxnorw	k1, k1, k1
+# CHECK-NEXT:  1      1     0.50                        vpxord	zmm1, zmm1, zmm1
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - Zn4AGU0
+# CHECK-NEXT: [1]   - Zn4AGU1
+# CHECK-NEXT: [2]   - Zn4AGU2
+# CHECK-NEXT: [3]   - Zn4ALU0
+# CHECK-NEXT: [4]   - Zn4ALU1
+# CHECK-NEXT: [5]   - Zn4ALU2
+# CHECK-NEXT: [6]   - Zn4ALU3
+# CHECK-NEXT: [7]   - Zn4BRU1
+# CHECK-NEXT: [8]   - Zn4FP0
+# CHECK-NEXT: [9]   - Zn4FP1
+# CHECK-NEXT: [10]  - Zn4FP2
+# CHECK-NEXT: [11]  - Zn4FP3
+# CHECK-NEXT: [12.0] - Zn4FP45
+# CHECK-NEXT: [12.1] - Zn4FP45
+# CHECK-NEXT: [13]  - Zn4FPSt
+# CHECK-NEXT: [14.0] - Zn4LSU
+# CHECK-NEXT: [14.1] - Zn4LSU
+# CHECK-NEXT: [14.2] - Zn4LSU
+# CHECK-NEXT: [15.0] - Zn4Load
+# CHECK-NEXT: [15.1] - Zn4Load
+# CHECK-NEXT: [15.2] - Zn4Load
+# CHECK-NEXT: [16.0] - Zn4Store
+# CHECK-NEXT: [16.1] - Zn4Store
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.22   2.24   2.26   2.28    -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.84   1.64   1.60   1.92    -      -      -      -      -      -      -      -      -      -      -     vpconflictd	zmm0, zmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.64   0.36    -      -      -      -      -      -      -      -      -      -      -     kxnorw	k1, k1, k1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.38   0.60   0.02    -      -      -      -      -      -      -      -      -      -      -      -     vpxord	zmm1, zmm1, zmm1
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeeeER   vpconflictd	zmm0, zmm3
+# CHECK-NEXT: [0,1]     D==eE---R   kxnorw	k1, k1, k1
+# CHECK-NEXT: [0,2]     DeE-----R   vpxord	zmm1, zmm1, zmm1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vpconflictd	zmm0, zmm3
+# CHECK-NEXT: 1.     1     3.0    3.0    3.0       kxnorw	k1, k1, k1
+# CHECK-NEXT: 2.     1     1.0    1.0    5.0       vpxord	zmm1, zmm1, zmm1
+# CHECK-NEXT:        1     1.7    1.7    2.7       <total>



More information about the llvm-commits mailing list