[llvm] r338372 - [llvm-mca][BtVer2] Teach how to identify dependency-breaking idioms.

Andrea Di Biagio via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 31 06:21:43 PDT 2018


Author: adibiagio
Date: Tue Jul 31 06:21:43 2018
New Revision: 338372

URL: http://llvm.org/viewvc/llvm-project?rev=338372&view=rev
Log:
[llvm-mca][BtVer2] Teach how to identify dependency-breaking idioms.

This patch teaches llvm-mca how to identify dependency breaking instructions on
btver2.

An example of a dependency-breaking instruction is the zero-idiom XOR (example:
`XOR %eax, %eax`), which always generates zero regardless of the actual value of
the input register operands.
Dependency breaking instructions don't have to wait on their input register
operands before executing. This is because the computation is not dependent on
the inputs.

Not all dependency-breaking idioms are also zero-latency instructions. For
example, `CMPEQ %xmm1, %xmm1` is independent of the value of XMM1, and it
generates a vector of all-ones.
That instruction is not eliminated at the register renaming stage, and its
opcode is issued to a pipeline for execution. So, the latency is not zero.

This patch adds a new method named isDependencyBreaking() to the MCInstrAnalysis
interface. That method takes as input an instruction (i.e. MCInst) and a
MCSubtargetInfo.
The default implementation of isDependencyBreaking() conservatively returns
false for all instructions. Targets may override the default behavior for
specific CPUs, and return a value which better matches the subtarget behavior.

In the future, we should teach TableGen how to automatically generate the body
of isDependencyBreaking() from scheduling predicate definitions. This would
allow us to expose the knowledge about dependency-breaking instructions to the
machine schedulers (and, potentially, other codegen passes).

Differential Revision: https://reviews.llvm.org/D49310

Modified:
    llvm/trunk/include/llvm/MC/MCInstrAnalysis.h
    llvm/trunk/lib/MC/MCInstrAnalysis.cpp
    llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
    llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s
    llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s
    llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s
    llvm/trunk/test/tools/llvm-mca/X86/BtVer2/one-idioms.s
    llvm/trunk/tools/llvm-mca/DispatchStage.cpp
    llvm/trunk/tools/llvm-mca/InstrBuilder.cpp
    llvm/trunk/tools/llvm-mca/Instruction.h
    llvm/trunk/tools/llvm-mca/RetireStage.cpp

Modified: llvm/trunk/include/llvm/MC/MCInstrAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/MC/MCInstrAnalysis.h?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/include/llvm/MC/MCInstrAnalysis.h (original)
+++ llvm/trunk/include/llvm/MC/MCInstrAnalysis.h Tue Jul 31 06:21:43 2018
@@ -87,6 +87,19 @@ public:
                                     const MCInst &Inst,
                                     APInt &Writes) const;
 
+  /// Returns true if \param Inst is a dependency breaking instruction for the
+  /// given subtarget.
+  ///
+  /// The value computed by a dependency breaking instruction is not dependent
+  /// on the inputs. An example of dependency breaking instruction on X86 is
+  /// `XOR %eax, %eax`.
+  /// TODO: In future, we could implement an alternative approach where this
+  /// method returns `true` if the input instruction is not dependent on
+  /// some/all of its input operands. An APInt mask could then be used to
+  /// identify independent operands.
+  virtual bool isDependencyBreaking(const MCSubtargetInfo &STI,
+                                    const MCInst &Inst) const;
+
   /// Given a branch instruction try to get the address the branch
   /// targets. Return true on success, and the address in Target.
   virtual bool

Modified: llvm/trunk/lib/MC/MCInstrAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/MC/MCInstrAnalysis.cpp?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/lib/MC/MCInstrAnalysis.cpp (original)
+++ llvm/trunk/lib/MC/MCInstrAnalysis.cpp Tue Jul 31 06:21:43 2018
@@ -24,6 +24,11 @@ bool MCInstrAnalysis::clearsSuperRegiste
   return false;
 }
 
+bool MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI,
+                                           const MCInst &Inst) const {
+  return false;
+}
+
 bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
                                      uint64_t Size, uint64_t &Target) const {
   if (Inst.getNumOperands() == 0 ||

Modified: llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp (original)
+++ llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp Tue Jul 31 06:21:43 2018
@@ -307,10 +307,84 @@ class X86MCInstrAnalysis : public MCInst
 public:
   X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {}
 
+  bool isDependencyBreaking(const MCSubtargetInfo &STI,
+                            const MCInst &Inst) const override;
   bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
                             APInt &Mask) const override;
 };
 
+bool X86MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI,
+                                              const MCInst &Inst) const {
+  if (STI.getCPU() == "btver2") {
+    // Reference: Agner Fog's microarchitecture.pdf - Section 20 "AMD Bobcat and
+    // Jaguar pipeline", subsection 8 "Dependency-breaking instructions".
+    switch (Inst.getOpcode()) {
+    default:
+      return false;
+    case X86::SUB32rr:
+    case X86::SUB64rr:
+    case X86::SBB32rr:
+    case X86::SBB64rr:
+    case X86::XOR32rr:
+    case X86::XOR64rr:
+    case X86::XORPSrr:
+    case X86::XORPDrr:
+    case X86::VXORPSrr:
+    case X86::VXORPDrr:
+    case X86::ANDNPSrr:
+    case X86::VANDNPSrr:
+    case X86::ANDNPDrr:
+    case X86::VANDNPDrr:
+    case X86::PXORrr:
+    case X86::VPXORrr:
+    case X86::PANDNrr:
+    case X86::VPANDNrr:
+    case X86::PSUBBrr:
+    case X86::PSUBWrr:
+    case X86::PSUBDrr:
+    case X86::PSUBQrr:
+    case X86::VPSUBBrr:
+    case X86::VPSUBWrr:
+    case X86::VPSUBDrr:
+    case X86::VPSUBQrr:
+    case X86::PCMPEQBrr:
+    case X86::PCMPEQWrr:
+    case X86::PCMPEQDrr:
+    case X86::PCMPEQQrr:
+    case X86::VPCMPEQBrr:
+    case X86::VPCMPEQWrr:
+    case X86::VPCMPEQDrr:
+    case X86::VPCMPEQQrr:
+    case X86::PCMPGTBrr:
+    case X86::PCMPGTWrr:
+    case X86::PCMPGTDrr:
+    case X86::PCMPGTQrr:
+    case X86::VPCMPGTBrr:
+    case X86::VPCMPGTWrr:
+    case X86::VPCMPGTDrr:
+    case X86::VPCMPGTQrr:
+    case X86::MMX_PXORirr:
+    case X86::MMX_PANDNirr:
+    case X86::MMX_PSUBBirr:
+    case X86::MMX_PSUBDirr:
+    case X86::MMX_PSUBQirr:
+    case X86::MMX_PSUBWirr:
+    case X86::MMX_PCMPGTBirr:
+    case X86::MMX_PCMPGTDirr:
+    case X86::MMX_PCMPGTWirr:
+    case X86::MMX_PCMPEQBirr:
+    case X86::MMX_PCMPEQDirr:
+    case X86::MMX_PCMPEQWirr:
+      return Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg();
+    case X86::CMP32rr:
+    case X86::CMP64rr:
+      return Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg();
+    }
+  }
+
+  return false;
+}
+
 bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
                                               const MCInst &Inst,
                                               APInt &Mask) const {

Modified: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s Tue Jul 31 06:21:43 2018
@@ -11,9 +11,9 @@ cmovae %ebx, %eax
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      3000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      1504
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: IPC:               1.99
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
@@ -54,14 +54,14 @@ cmovae %ebx, %eax
 # CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%ebx, %eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     012345678
+# CHECK-NEXT: Index     0123456
 
-# CHECK:      [0,0]     DeER .  .   cmpl	%eax, %eax
-# CHECK-NEXT: [0,1]     D=eER.  .   cmovael	%ebx, %eax
-# CHECK-NEXT: [1,0]     .D=eER  .   cmpl	%eax, %eax
-# CHECK-NEXT: [1,1]     .D==eER .   cmovael	%ebx, %eax
-# CHECK-NEXT: [2,0]     . D==eER.   cmpl	%eax, %eax
-# CHECK-NEXT: [2,1]     . D===eER   cmovael	%ebx, %eax
+# CHECK:      [0,0]     DeER ..   cmpl	%eax, %eax
+# CHECK-NEXT: [0,1]     D=eER..   cmovael	%ebx, %eax
+# CHECK-NEXT: [1,0]     .DeER..   cmpl	%eax, %eax
+# CHECK-NEXT: [1,1]     .D=eER.   cmovael	%ebx, %eax
+# CHECK-NEXT: [2,0]     . DeER.   cmpl	%eax, %eax
+# CHECK-NEXT: [2,1]     . D=eER   cmovael	%ebx, %eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -70,5 +70,5 @@ cmovae %ebx, %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     2.0    0.3    0.0       cmpl	%eax, %eax
-# CHECK-NEXT: 1.     3     3.0    0.0    0.0       cmovael	%ebx, %eax
+# CHECK-NEXT: 0.     3     1.0    1.0    0.0       cmpl	%eax, %eax
+# CHECK-NEXT: 1.     3     2.0    0.0    0.0       cmovael	%ebx, %eax

Modified: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s Tue Jul 31 06:21:43 2018
@@ -14,9 +14,9 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      6000
-# CHECK-NEXT: Total Cycles:      6003
+# CHECK-NEXT: Total Cycles:      3003
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: IPC:               2.00
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -61,21 +61,20 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     vpcmpeqq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeER .    .   .   vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [0,1]     D=eER.    .   .   vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [0,2]     .D=eER    .   .   vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [0,3]     .D==eER   .   .   vpcmpeqq	%xmm3, %xmm3, %xmm0
-# CHECK-NEXT: [1,0]     . D==eER  .   .   vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [1,1]     . D===eER .   .   vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [1,2]     .  D===eER.   .   vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [1,3]     .  D====eER   .   vpcmpeqq	%xmm3, %xmm3, %xmm0
-# CHECK-NEXT: [2,0]     .   D====eER  .   vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [2,1]     .   D=====eER .   vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [2,2]     .    D=====eER.   vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [2,3]     .    D======eER   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK:      [0,0]     DeER .  .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DeER .  .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     .DeER.  .   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,3]     .DeER.  .   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [1,0]     . DeER  .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     . DeER  .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .  DeER .   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,3]     .  DeER .   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [2,0]     .   DeER.   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     .   DeER.   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,2]     .    DeER   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,3]     .    DeER   vpcmpeqq	%xmm3, %xmm3, %xmm0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -84,7 +83,7 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     3.0    0.3    0.0       vpcmpeqb	%xmm0, %xmm0, %xmm1
-# CHECK-NEXT: 1.     3     4.0    0.0    0.0       vpcmpeqw	%xmm1, %xmm1, %xmm2
-# CHECK-NEXT: 2.     3     4.0    0.0    0.0       vpcmpeqd	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: 3.     3     5.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: 0.     3     1.0    1.0    0.0       vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     3     1.0    1.0    0.0       vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 2.     3     1.0    1.0    0.0       vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 3.     3     1.0    1.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm0

Modified: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s Tue Jul 31 06:21:43 2018
@@ -13,9 +13,9 @@ sbb %eax, %eax
 
 # CHECK:      Iterations:        1500
 # CHECK-NEXT: Instructions:      4500
-# CHECK-NEXT: Total Cycles:      6745
+# CHECK-NEXT: Total Cycles:      3007
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               0.67
+# CHECK-NEXT: IPC:               1.50
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -49,27 +49,27 @@ sbb %eax, %eax
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT: 2.01   1.99    -      -      -      -      -      -     1.00    -      -      -      -      -
+# CHECK-NEXT: 2.00   2.00    -      -      -      -      -      -     1.00    -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
 # CHECK-NEXT:  -     1.00    -      -      -      -      -      -     1.00    -      -      -      -      -     imull	%edx, %eax
-# CHECK-NEXT: 0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %edx
-# CHECK-NEXT: 1.01   0.99    -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %edx
+# CHECK-NEXT: 2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .   imull	%edx, %eax
-# CHECK-NEXT: [0,1]     .DeE-R    .    .   addl	%edx, %edx
-# CHECK-NEXT: [0,2]     .D==eER   .    .   sbbl	%eax, %eax
-# CHECK-NEXT: [1,0]     . D===eeeER    .   imull	%edx, %eax
-# CHECK-NEXT: [1,1]     .  DeE----R    .   addl	%edx, %edx
-# CHECK-NEXT: [1,2]     .  D=====eER   .   sbbl	%eax, %eax
-# CHECK-NEXT: [2,0]     .   D=====eeeER.   imull	%edx, %eax
-# CHECK-NEXT: [2,1]     .    DeE------R.   addl	%edx, %edx
-# CHECK-NEXT: [2,2]     .    D=======eER   sbbl	%eax, %eax
+# CHECK:      [0,0]     DeeeER    ..   imull	%edx, %eax
+# CHECK-NEXT: [0,1]     .DeE-R    ..   addl	%edx, %edx
+# CHECK-NEXT: [0,2]     .D=eE-R   ..   sbbl	%eax, %eax
+# CHECK-NEXT: [1,0]     . D==eeeER..   imull	%edx, %eax
+# CHECK-NEXT: [1,1]     .  DeE---R..   addl	%edx, %edx
+# CHECK-NEXT: [1,2]     .  D=eE---R.   sbbl	%eax, %eax
+# CHECK-NEXT: [2,0]     .   D=eeeER.   imull	%edx, %eax
+# CHECK-NEXT: [2,1]     .    D=eE--R   addl	%edx, %edx
+# CHECK-NEXT: [2,2]     .    D==eE-R   sbbl	%eax, %eax
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -78,6 +78,6 @@ sbb %eax, %eax
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     3.7    0.7    0.0       imull	%edx, %eax
-# CHECK-NEXT: 1.     3     1.0    1.0    3.7       addl	%edx, %edx
-# CHECK-NEXT: 2.     3     5.7    0.0    0.0       sbbl	%eax, %eax
+# CHECK-NEXT: 0.     3     2.0    0.7    0.0       imull	%edx, %eax
+# CHECK-NEXT: 1.     3     1.3    1.3    2.0       addl	%edx, %edx
+# CHECK-NEXT: 2.     3     2.3    0.0    1.7       sbbl	%eax, %eax

Modified: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/one-idioms.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-mca/X86/BtVer2/one-idioms.s?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/one-idioms.s (original)
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/one-idioms.s Tue Jul 31 06:21:43 2018
@@ -1,9 +1,11 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -register-file-stats -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 -register-file-stats < %s | FileCheck %s
 
 # These are dependency-breaking one-idioms.
 # Much like zero-idioms, but they produce ones, and do consume resources.
 
+# perf stats reports a throughput of 2.00 IPC.
+
 pcmpeqb   %mm2, %mm2
 pcmpeqd   %mm2, %mm2
 pcmpeqw   %mm2, %mm2
@@ -25,11 +27,11 @@ vpcmpeqw  %xmm3, %xmm3, %xmm5
 
 # FIXME: their handling is broken in llvm-mca.
 
-# CHECK:      Iterations:        1
-# CHECK-NEXT: Instructions:      15
-# CHECK-NEXT: Total Cycles:      12
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1500
+# CHECK-NEXT: Total Cycles:      753
 # CHECK-NEXT: Dispatch Width:    2
-# CHECK-NEXT: IPC:               1.25
+# CHECK-NEXT: IPC:               1.99
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Instruction Info:
@@ -58,13 +60,13 @@ vpcmpeqw  %xmm3, %xmm3, %xmm5
 # CHECK-NEXT:  1      1     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    15
-# CHECK-NEXT: Max number of mappings used:         8
+# CHECK-NEXT: Total number of mappings created:    1500
+# CHECK-NEXT: Max number of mappings used:         6
 
 # CHECK:      *  Register File #1 -- JFpuPRF:
 # CHECK-NEXT:    Number of physical registers:     72
-# CHECK-NEXT:    Total number of mappings created: 15
-# CHECK-NEXT:    Max number of mappings used:      8
+# CHECK-NEXT:    Total number of mappings created: 1500
+# CHECK-NEXT:    Max number of mappings used:      6
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
@@ -89,45 +91,45 @@ vpcmpeqw  %xmm3, %xmm3, %xmm5
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT:  -      -      -      -      -     7.00   8.00    -      -      -      -     7.00   8.00    -
+# CHECK-NEXT:  -      -      -      -      -     7.50   7.50    -      -      -      -     7.50   7.50    -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     pcmpeqb	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     pcmpeqd	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     pcmpeqw	%mm2, %mm2
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     pcmpeqb	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     pcmpeqd	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     pcmpeqq	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     pcmpeqw	%xmm2, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     vpcmpeqb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     vpcmpeqd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     vpcmpeqq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     vpcmpeqw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     vpcmpeqb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     vpcmpeqd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -     vpcmpeqq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -     vpcmpeqw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     pcmpeqb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     pcmpeqd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     pcmpeqw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -     vpcmpeqw	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    ..   pcmpeqb	%mm2, %mm2
-# CHECK-NEXT: [0,1]     D=eER.    ..   pcmpeqd	%mm2, %mm2
-# CHECK-NEXT: [0,2]     .D=eER    ..   pcmpeqw	%mm2, %mm2
-# CHECK-NEXT: [0,3]     .DeE-R    ..   pcmpeqb	%xmm2, %xmm2
-# CHECK-NEXT: [0,4]     . DeE-R   ..   pcmpeqd	%xmm2, %xmm2
-# CHECK-NEXT: [0,5]     . D=eER   ..   pcmpeqq	%xmm2, %xmm2
-# CHECK-NEXT: [0,6]     .  D=eER  ..   pcmpeqw	%xmm2, %xmm2
-# CHECK-NEXT: [0,7]     .  DeE-R  ..   vpcmpeqb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,8]     .   DeE-R ..   vpcmpeqd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,9]     .   D=eER ..   vpcmpeqq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,10]    .    D=eER..   vpcmpeqw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,11]    .    D==eER.   vpcmpeqb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,12]    .    .D=eER.   vpcmpeqd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,13]    .    .D==eER   vpcmpeqq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,14]    .    . D=eER   vpcmpeqw	%xmm3, %xmm3, %xmm5
+# CHECK:      [0,0]     DeER .    .   pcmpeqb	%mm2, %mm2
+# CHECK-NEXT: [0,1]     DeER .    .   pcmpeqd	%mm2, %mm2
+# CHECK-NEXT: [0,2]     .DeER.    .   pcmpeqw	%mm2, %mm2
+# CHECK-NEXT: [0,3]     .DeER.    .   pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT: [0,4]     . DeER    .   pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT: [0,5]     . DeER    .   pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT: [0,6]     .  DeER   .   pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT: [0,7]     .  DeER   .   vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,8]     .   DeER  .   vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,9]     .   DeER  .   vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,10]    .    DeER .   vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,11]    .    DeER .   vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,12]    .    .DeER.   vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,13]    .    .DeER.   vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,14]    .    . DeER   vpcmpeqw	%xmm3, %xmm3, %xmm5
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -137,17 +139,17 @@ vpcmpeqw  %xmm3, %xmm3, %xmm5
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       pcmpeqb	%mm2, %mm2
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       pcmpeqd	%mm2, %mm2
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       pcmpeqw	%mm2, %mm2
-# CHECK-NEXT: 3.     1     1.0    1.0    1.0       pcmpeqb	%xmm2, %xmm2
-# CHECK-NEXT: 4.     1     1.0    0.0    1.0       pcmpeqd	%xmm2, %xmm2
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       pcmpeqq	%xmm2, %xmm2
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       pcmpeqw	%xmm2, %xmm2
-# CHECK-NEXT: 7.     1     1.0    1.0    1.0       vpcmpeqb	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 8.     1     1.0    0.0    1.0       vpcmpeqd	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 10.    1     2.0    0.0    0.0       vpcmpeqw	%xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 11.    1     3.0    0.0    0.0       vpcmpeqb	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 12.    1     2.0    0.0    0.0       vpcmpeqd	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 13.    1     3.0    1.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 14.    1     2.0    1.0    0.0       vpcmpeqw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       pcmpeqd	%mm2, %mm2
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       pcmpeqw	%mm2, %mm2
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT: 5.     1     1.0    1.0    0.0       pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT: 7.     1     1.0    1.0    0.0       vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 9.     1     1.0    1.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 10.    1     1.0    1.0    0.0       vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 11.    1     1.0    1.0    0.0       vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 12.    1     1.0    1.0    0.0       vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 13.    1     1.0    1.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 14.    1     1.0    1.0    0.0       vpcmpeqw	%xmm3, %xmm3, %xmm5

Modified: llvm/trunk/tools/llvm-mca/DispatchStage.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-mca/DispatchStage.cpp?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-mca/DispatchStage.cpp (original)
+++ llvm/trunk/tools/llvm-mca/DispatchStage.cpp Tue Jul 31 06:21:43 2018
@@ -107,17 +107,21 @@ void DispatchStage::dispatch(InstRef IR)
   // instruction. A dependency-breaking instruction is a zero-latency
   // instruction that doesn't consume hardware resources.
   // An example of dependency-breaking instruction on X86 is a zero-idiom XOR.
-  if (!Desc.isZeroLatency())
-    for (std::unique_ptr<ReadState> &RS : IS.getUses())
+  bool IsDependencyBreaking = IS.isDependencyBreaking();
+  for (std::unique_ptr<ReadState> &RS : IS.getUses())
+    if (RS->isImplicitRead() || !IsDependencyBreaking)
       updateRAWDependencies(*RS, STI);
 
   // By default, a dependency-breaking zero-latency instruction is expected to
   // be optimized at register renaming stage. That means, no physical register
   // is allocated to the instruction.
+  bool ShouldAllocateRegisters =
+      !(Desc.isZeroLatency() && IsDependencyBreaking);
   SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
-  for (std::unique_ptr<WriteState> &WS : IS.getDefs())
+  for (std::unique_ptr<WriteState> &WS : IS.getDefs()) {
     PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles,
-                         !Desc.isZeroLatency());
+                         ShouldAllocateRegisters);
+  }
 
   // Reserve slots in the RCU, and notify the instruction that it has been
   // dispatched to the schedulers for execution.

Modified: llvm/trunk/tools/llvm-mca/InstrBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-mca/InstrBuilder.cpp?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-mca/InstrBuilder.cpp (original)
+++ llvm/trunk/tools/llvm-mca/InstrBuilder.cpp Tue Jul 31 06:21:43 2018
@@ -443,6 +443,10 @@ InstrBuilder::createInstruction(const MC
   // register writes implicitly clear the upper portion of a super-register.
   MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
 
+  // Check if this is a dependency breaking instruction.
+  if (MCIA.isDependencyBreaking(STI, MCI))
+    NewIS->setDependencyBreaking();
+
   // Initialize writes.
   unsigned WriteIndex = 0;
   for (const WriteDescriptor &WD : D.Writes) {

Modified: llvm/trunk/tools/llvm-mca/Instruction.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-mca/Instruction.h?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-mca/Instruction.h (original)
+++ llvm/trunk/tools/llvm-mca/Instruction.h Tue Jul 31 06:21:43 2018
@@ -170,8 +170,6 @@ class ReadState {
   bool IsReady;
 
 public:
-  bool isReady() const { return IsReady; }
-
   ReadState(const ReadDescriptor &Desc, unsigned RegID)
       : RD(Desc), RegisterID(RegID), DependentWrites(0),
         CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true) {}
@@ -182,6 +180,9 @@ public:
   unsigned getSchedClass() const { return RD.SchedClassID; }
   unsigned getRegisterID() const { return RegisterID; }
 
+  bool isReady() const { return IsReady; }
+  bool isImplicitRead() const { return RD.isImplicitRead(); }
+
   void cycleEvent();
   void writeStartEvent(unsigned Cycles);
   void setDependentWrites(unsigned Writes) {
@@ -299,6 +300,8 @@ class Instruction {
   // Retire Unit token ID for this instruction.
   unsigned RCUTokenID;
 
+  bool IsDepBreaking;
+
   using UniqueDef = std::unique_ptr<WriteState>;
   using UniqueUse = std::unique_ptr<ReadState>;
   using VecDefs = std::vector<UniqueDef>;
@@ -314,7 +317,8 @@ class Instruction {
 
 public:
   Instruction(const InstrDesc &D)
-      : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES) {}
+      : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0),
+        IsDepBreaking(false) {}
   Instruction(const Instruction &Other) = delete;
   Instruction &operator=(const Instruction &Other) = delete;
 
@@ -326,6 +330,9 @@ public:
   unsigned getRCUTokenID() const { return RCUTokenID; }
   int getCyclesLeft() const { return CyclesLeft; }
 
+  bool isDependencyBreaking() const { return IsDepBreaking; }
+  void setDependencyBreaking() { IsDepBreaking = true; }
+
   unsigned getNumUsers() const {
     unsigned NumUsers = 0;
     for (const UniqueDef &Def : Defs)

Modified: llvm/trunk/tools/llvm-mca/RetireStage.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-mca/RetireStage.cpp?rev=338372&r1=338371&r2=338372&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-mca/RetireStage.cpp (original)
+++ llvm/trunk/tools/llvm-mca/RetireStage.cpp Tue Jul 31 06:21:43 2018
@@ -45,10 +45,12 @@ void RetireStage::cycleStart() {
 void RetireStage::notifyInstructionRetired(const InstRef &IR) {
   LLVM_DEBUG(dbgs() << "[E] Instruction Retired: #" << IR << '\n');
   SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
-  const InstrDesc &Desc = IR.getInstruction()->getDesc();
+  const Instruction &Inst = *IR.getInstruction();
+  const InstrDesc &Desc = Inst.getDesc();
 
-  for (const std::unique_ptr<WriteState> &WS : IR.getInstruction()->getDefs())
-    PRF.removeRegisterWrite(*WS.get(), FreedRegs, !Desc.isZeroLatency());
+  bool ShouldFreeRegs = !(Desc.isZeroLatency() && Inst.isDependencyBreaking());
+  for (const std::unique_ptr<WriteState> &WS : Inst.getDefs())
+    PRF.removeRegisterWrite(*WS.get(), FreedRegs, ShouldFreeRegs);
   notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
 }
 




More information about the llvm-commits mailing list