[llvm] bba25a9 - [MCA] Support carry-over instructions for in-order processors
Andrew Savonichev via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 25 14:17:45 PDT 2021
Author: Andrew Savonichev
Date: 2021-03-26T00:06:19+03:00
New Revision: bba25a9cd827f9ee822616cc194206ffb7c0a49b
URL: https://github.com/llvm/llvm-project/commit/bba25a9cd827f9ee822616cc194206ffb7c0a49b
DIFF: https://github.com/llvm/llvm-project/commit/bba25a9cd827f9ee822616cc194206ffb7c0a49b.diff
LOG: [MCA] Support carry-over instructions for in-order processors
Instructions that have more uops than the processor's IssueWidth are
issued in multiple cycles.
The patch fixes PR49712.
Differential Revision: https://reviews.llvm.org/D99339
Added:
llvm/test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s
Modified:
llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
llvm/lib/MCA/Stages/InOrderIssueStage.cpp
llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
Removed:
################################################################################
diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
index e3aec7fb78ca..1a944243db60 100644
--- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
+++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h
@@ -45,6 +45,11 @@ class InOrderIssueStage final : public Stage {
InstRef StalledInst;
unsigned StallCyclesLeft;
+ /// Instruction that is issued in more than 1 cycle.
+ InstRef CarriedOver;
+ /// Number of CarriedOver uops left to issue.
+ unsigned CarryOver;
+
/// Number of instructions that can be issued in the current cycle.
unsigned Bandwidth;
@@ -67,6 +72,9 @@ class InOrderIssueStage final : public Stage {
/// Update status of instructions from IssuedInst.
void updateIssuedInst();
+ /// Continue to issue the CarriedOver instruction.
+ void updateCarriedOver();
+
/// Retire instruction once it is executed.
void retireInstruction(InstRef &IR);
@@ -74,7 +82,8 @@ class InOrderIssueStage final : public Stage {
InOrderIssueStage(RegisterFile &PRF, const MCSchedModel &SM,
const MCSubtargetInfo &STI)
: SM(SM), STI(STI), PRF(PRF), RM(std::make_unique<ResourceManager>(SM)),
- NumIssued(0), StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {}
+ NumIssued(0), StallCyclesLeft(0), CarryOver(0), Bandwidth(0),
+ LastWriteBackCycle(0) {}
bool isAvailable(const InstRef &) const override;
bool hasWorkToComplete() const override;
diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
index 2d2a75cc99a7..a32319b4b390 100644
--- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
+++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -29,15 +29,19 @@ namespace llvm {
namespace mca {
bool InOrderIssueStage::hasWorkToComplete() const {
- return !IssuedInst.empty() || StalledInst;
+ return !IssuedInst.empty() || StalledInst || CarriedOver;
}
bool InOrderIssueStage::isAvailable(const InstRef &IR) const {
+ if (StalledInst || CarriedOver)
+ return false;
+
const Instruction &Inst = *IR.getInstruction();
unsigned NumMicroOps = Inst.getNumMicroOps();
const InstrDesc &Desc = Inst.getDesc();
- if (Bandwidth < NumMicroOps)
+ bool ShouldCarryOver = NumMicroOps > SM.IssueWidth;
+ if (Bandwidth < NumMicroOps && !ShouldCarryOver)
return false;
// Instruction with BeginGroup must be the first instruction to be issued in a
@@ -247,15 +251,19 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) {
}
notifyInstructionIssue(IR, UsedResources, *this);
- if (Desc.EndGroup) {
+ bool ShouldCarryOver = NumMicroOps > Bandwidth; // More uops than this cycle can still issue.
+ if (ShouldCarryOver) {
+ CarryOver = NumMicroOps - Bandwidth; // uops left for updateCarriedOver() to issue.
+ CarriedOver = IR;
+ NumIssued += Bandwidth; // Count this cycle's uops before Bandwidth is cleared below.
Bandwidth = 0;
+ LLVM_DEBUG(dbgs() << "[N] Carry over #" << IR << " \n");
} else {
- assert(Bandwidth >= NumMicroOps);
- Bandwidth -= NumMicroOps;
+ NumIssued += NumMicroOps; // The whole instruction issues in this cycle.
+ Bandwidth = Desc.EndGroup ? 0 : Bandwidth - NumMicroOps;
}
IssuedInst.push_back(IR);
- NumIssued += NumMicroOps;
if (!IR.getInstruction()->getDesc().RetireOOO)
LastWriteBackCycle = findLastWriteBackCycle(IR);
@@ -295,6 +303,32 @@ void InOrderIssueStage::updateIssuedInst() {
IssuedInst.resize(IssuedInst.size() - NumExecuted);
}
+void InOrderIssueStage::updateCarriedOver() { // Spends Bandwidth on the uops still owed by CarriedOver.
+ if (!CarriedOver) // No instruction is currently being carried over.
+ return;
+
+ assert(!StalledInst && "A stalled instruction cannot be carried over.");
+
+ if (CarryOver > Bandwidth) { // More uops remain than the available Bandwidth.
+ CarryOver -= Bandwidth; // Use up all of the available Bandwidth...
+ Bandwidth = 0; // ...so none is left after this call.
+ LLVM_DEBUG(dbgs() << "[N] Carry over (" << CarryOver << "uops left) #"
+ << CarriedOver << " \n");
+ return; // CarriedOver stays set; a later call continues issuing it.
+ }
+
+ LLVM_DEBUG(dbgs() << "[N] Carry over (complete) #" << CarriedOver
+ << " \n");
+
+ if (CarriedOver.getInstruction()->getDesc().EndGroup)
+ Bandwidth = 0; // EndGroup: leave no Bandwidth after the carried-over instruction.
+ else
+ Bandwidth -= CarryOver; // The remaining uops fit; keep the leftover Bandwidth.
+
+ CarriedOver = InstRef(); // Reset the carry-over state now that issue is complete.
+ CarryOver = 0;
+}
+
void InOrderIssueStage::retireInstruction(InstRef &IR) {
Instruction &IS = *IR.getInstruction();
IS.retire();
@@ -319,6 +353,9 @@ llvm::Error InOrderIssueStage::cycleStart() {
updateIssuedInst();
+ // Continue to issue the instruction carried over from the previous cycle
+ updateCarriedOver();
+
// Issue instructions scheduled for this cycle
if (!StallCyclesLeft && StalledInst) {
if (llvm::Error E = tryIssue(StalledInst, &StallCyclesLeft))
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s
new file mode 100644
index 000000000000..a5715b965210
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a53 --timeline --iterations=1 < %s | FileCheck %s
+
+ldp w3, w5, [x10], #4 // 2uop + 1uop carry over
+add w10, w11, w12
+add w13, w14, w15
+ldp w7, w8, [x11] // 2uop, no carry over
+add w16, w17, w18
+add w19, w20, w21
+
+# CHECK: Iterations: 1
+# CHECK-NEXT: Instructions: 6
+# CHECK-NEXT: Total Cycles: 8
+# CHECK-NEXT: Total uOps: 9
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 1.13
+# CHECK-NEXT: IPC: 0.75
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 3 4 2.00 * ldp w3, w5, [x10], #4
+# CHECK-NEXT: 1 3 0.50 add w10, w11, w12
+# CHECK-NEXT: 1 3 0.50 add w13, w14, w15
+# CHECK-NEXT: 2 4 2.00 * ldp w7, w8, [x11]
+# CHECK-NEXT: 1 3 0.50 add w16, w17, w18
+# CHECK-NEXT: 1 3 0.50 add w19, w20, w21
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - A53UnitALU
+# CHECK-NEXT: [0.1] - A53UnitALU
+# CHECK-NEXT: [1] - A53UnitB
+# CHECK-NEXT: [2] - A53UnitDiv
+# CHECK-NEXT: [3] - A53UnitFPALU
+# CHECK-NEXT: [4] - A53UnitFPMDS
+# CHECK-NEXT: [5] - A53UnitLdSt
+# CHECK-NEXT: [6] - A53UnitMAC
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6]
+# CHECK-NEXT: 2.00 2.00 - - - - 4.00 -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: - - - - - - 2.00 - ldp w3, w5, [x10], #4
+# CHECK-NEXT: - 1.00 - - - - - - add w10, w11, w12
+# CHECK-NEXT: 1.00 - - - - - - - add w13, w14, w15
+# CHECK-NEXT: - - - - - - 2.00 - ldp w7, w8, [x11]
+# CHECK-NEXT: - 1.00 - - - - - - add w16, w17, w18
+# CHECK-NEXT: 1.00 - - - - - - - add w19, w20, w21
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 01234567
+
+# CHECK: [0,0] DeeeE. . ldp w3, w5, [x10], #4
+# CHECK-NEXT: [0,1] .DeeE. . add w10, w11, w12
+# CHECK-NEXT: [0,2] . DeeE . add w13, w14, w15
+# CHECK-NEXT: [0,3] . DeeeE ldp w7, w8, [x11]
+# CHECK-NEXT: [0,4] . DeeE add w16, w17, w18
+# CHECK-NEXT: [0,5] . DeeE add w19, w20, w21
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 0.0 0.0 0.0 ldp w3, w5, [x10], #4
+# CHECK-NEXT: 1. 1 0.0 0.0 0.0 add w10, w11, w12
+# CHECK-NEXT: 2. 1 0.0 0.0 0.0 add w13, w14, w15
+# CHECK-NEXT: 3. 1 0.0 0.0 0.0 ldp w7, w8, [x11]
+# CHECK-NEXT: 4. 1 0.0 0.0 0.0 add w16, w17, w18
+# CHECK-NEXT: 5. 1 0.0 0.0 0.0 add w19, w20, w21
+# CHECK-NEXT: 1 0.0 0.0 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
index 28d811f01806..3a0991d875f6 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx10-double.s
@@ -28,8 +28,7 @@ v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
v_ldexp_f64 v[2:3], v[2:3], v0
-; FIXME: This instructions sends llvm-mca into an infinite loop
-;v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
+v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
v_trig_preop_f64 v[2:3], v[2:3], v0
@@ -41,14 +40,14 @@ v_rsq_f64 v[2:3], v[2:3]
v_sqrt_f64 v[4:5], v[4:5]
# CHECK: Iterations: 1
-# CHECK-NEXT: Instructions: 27
-# CHECK-NEXT: Total Cycles: 204
-# CHECK-NEXT: Total uOps: 27
+# CHECK-NEXT: Instructions: 28
+# CHECK-NEXT: Total Cycles: 224
+# CHECK-NEXT: Total uOps: 29
# CHECK: Dispatch Width: 1
# CHECK-NEXT: uOps Per Cycle: 0.13
# CHECK-NEXT: IPC: 0.13
-# CHECK-NEXT: Block RThroughput: 27.0
+# CHECK-NEXT: Block RThroughput: 29.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@@ -80,6 +79,7 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK-NEXT: 1 22 1.00 U v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 1 22 1.00 U v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 1 22 1.00 U v_ldexp_f64 v[2:3], v[2:3], v0
+# CHECK-NEXT: 2 22 2.00 U v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 1 22 1.00 U v_trig_preop_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: 1 22 1.00 U v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
# CHECK-NEXT: 1 22 1.00 U v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
@@ -98,7 +98,7 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6]
-# CHECK-NEXT: - - - 27.00 - 27.00 -
+# CHECK-NEXT: - - - 29.00 1.00 28.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions:
@@ -123,6 +123,7 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_ldexp_f64 v[2:3], v[2:3], v0
+# CHECK-NEXT: - - - 2.00 1.00 1.00 - v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_trig_preop_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: - - - 1.00 - 1.00 - v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
@@ -176,10 +177,11 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_ldexp_f64 v[2:3], v[2:3], v0
-# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_trig_preop_f64 v[2:3], v[2:3], v0
-# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
-# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
-# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_rcp_f64_e32 v[0:1], v[0:1]
-# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_rsq_f64_e32 v[2:3], v[2:3]
-# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_sqrt_f64_e32 v[4:5], v[4:5]
+# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
+# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_trig_preop_f64 v[2:3], v[2:3], v0
+# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
+# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
+# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_rcp_f64_e32 v[0:1], v[0:1]
+# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_rsq_f64_e32 v[2:3], v[2:3]
+# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_sqrt_f64_e32 v[4:5], v[4:5]
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>
More information about the llvm-commits
mailing list