[llvm] [llvm-mca][AArch64] Add AArch64 version of clearsSuperRegisters. (PR #92548)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 17 06:58:20 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Rin Dobrescu (Rin18)
<details>
<summary>Changes</summary>
This patch overrides the clearsSuperRegisters method defined in MCInstrAnalysis to identify register writes that clear the upper portion of all super-registers on AArch64 architecture.
On AArch64, a write to a general-purpose register of 32-bit data size is defined to use the lower 32-bits of the register and zero extend the upper 32-bits.
Similarly, SIMD and FP instructions operating on scalar data only access the lower bits of the SIMD&FP register. The unused upper bits are cleared to zero on a write.
This also applies to SIMD vector registers when the element size in bits multiplied by the number of lanes is lower than 128. The upper 64 bits of the vector register are cleared to zero on a write.
---
Patch is 62.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92548.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp (+46)
- (added) llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s (+791)
- (added) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s (+650)
``````````diff
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 043f0a03b7975..d545cfefc319d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -432,6 +432,52 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
return false;
}
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.implicit_defs().size();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+ // 32-bit General Purpose Register class.
+ const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
+ // Floating Point Register classes lower than 128-bit.
+ const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
+ const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
+ const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
+ const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // An update to the lower 32 bits of a 64 bit integer register is
+ // architecturally defined to zero extend the upper 32 bits on a write.
+ if (GPR32RC.contains(RegID))
+ return true;
+ // SIMD&FP instructions operating on scalar data only acccess the lower
+ // bits of a register, the upper bits are zero extended on a write. For
+ // SIMD vector registers smaller than 128-bits, the upper 64-bits of the
+ // register are zero extended on a write.
+ if (FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+ FPR32RC.contains(RegID) || FPR64RC.contains(RegID))
+ return true;
+ return false;
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.implicit_defs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+ }
+
std::vector<std::pair<uint64_t, uint64_t>>
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
const Triple &TargetTriple) const override {
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
new file mode 100644
index 0000000000000..ab81f9fb04af3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -0,0 +1,791 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN GPR32-bit
+ldr w0, [sp]
+add x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add v0.8h, v0.8h, v0.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add v0.4s, v0.4s, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add v0.2d, v0.2d, v0.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ins
+ins v0.b[0], v1.b[1]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN lanewise-load
+ld1 {v0.b}[0], [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - GPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 41
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.88
+# CHECK-NEXT: IPC: 4.88
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 0.33 * ldr w0, [sp]
+# CHECK-NEXT: 1 1 0.25 add x0, x0, x0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 0.22 0.22 0.28 0.28 - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr w0, [sp]
+# CHECK-NEXT: - - - - - - - 0.22 0.22 0.28 0.28 - - - - add x0, x0, x0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeeeER . ldr w0, [sp]
+# CHECK-NEXT: [0,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [1,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [1,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [2,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [2,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [3,0] D=eeeeER. ldr w0, [sp]
+# CHECK-NEXT: [3,1] D=====eER add x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 0.5 ldr w0, [sp]
+# CHECK-NEXT: 1. 4 5.3 0.0 0.0 add x0, x0, x0
+# CHECK-NEXT: 4 3.3 0.6 0.3 <total>
+
+# CHECK: [1] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [2] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [3] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [4] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/92548
More information about the llvm-commits
mailing list