[llvm] r239486 - [x86] Add a reassociation optimization to increase ILP via the MachineCombiner pass

Wed Jun 10 16:32:08 PDT 2015

Thanks, Richard! Checked in at r239497.

On Wed, Jun 10, 2015 at 5:10 PM, Richard Trieu <rtrieu at google.com> wrote:

>
>
> On Wed, Jun 10, 2015 at 1:32 PM, Sanjay Patel <spatel at rotateright.com>
> wrote:
>
>> Author: spatel
>> Date: Wed Jun 10 15:32:21 2015
>> New Revision: 239486
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=239486&view=rev
>> Log:
>> [x86] Add a reassociation optimization to increase ILP via the
>> MachineCombiner pass
>>
>> This is a reimplementation of D9780 at the machine instruction level
>> rather than the DAG.
>>
>> Use the MachineCombiner pass to reassociate scalar single-precision AVX
>> additions (just a
>> starting point; see the TODO comments) to increase ILP when it's safe to
>> do so.
>>
>> The code is closely based on the existing MachineCombiner optimization
>> that is implemented
>> for AArch64.
>>
>> This patch should not cause the kind of spilling tragedy that led to the
>> reversion of r236031.
>>
>> Differential Revision: http://reviews.llvm.org/D10321
>>
>>
>> Modified:
>>     llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
>>     llvm/trunk/lib/Target/X86/X86InstrInfo.h
>>     llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
>>     llvm/trunk/test/CodeGen/X86/fp-fast.ll
>>
>> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=239486&r1=239485&r2=239486&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
>> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Wed Jun 10 15:32:21 2015
>> @@ -6226,6 +6226,210 @@ hasHighOperandLatency(const InstrItinera
>>    return isHighLatencyDef(DefMI->getOpcode());
>>  }
>>
>> +/// If the input instruction is part of a chain of dependent ops that are
>> +/// suitable for reassociation, return the earlier instruction in the
>> sequence
>> +/// that defines its first operand, otherwise return a nullptr.
>> +/// If the instruction's operands must be commuted to be considered a
>> +/// reassociation candidate, Commuted will be set to true.
>> +static MachineInstr *isReassocCandidate(const MachineInstr &Inst,
>> +                                        unsigned AssocOpcode,
>> +                                        bool checkPrevOneUse,
>> +                                        bool &Commuted) {
>> +  if (Inst.getOpcode() != AssocOpcode)
>> +    return nullptr;
>> +
>> +  MachineOperand Op1 = Inst.getOperand(1);
>> +  MachineOperand Op2 = Inst.getOperand(2);
>> +
>> +  const MachineBasicBlock *MBB = Inst.getParent();
>> +  const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
>> +
>> +  // We need virtual register definitions.
>> +  MachineInstr *MI1 = nullptr;
>> +  MachineInstr *MI2 = nullptr;
>> +  if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
>> +    MI1 = MRI.getUniqueVRegDef(Op1.getReg());
>> +  if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
>> +    MI2 = MRI.getUniqueVRegDef(Op2.getReg());
>> +
>> +  // And they need to be in the trace (otherwise, they won't have a
>> depth).
>> +  if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB)
>> +    return nullptr;
>> +
>> +  Commuted = false;
>> +  if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() ==
>> AssocOpcode) {
>> +    std::swap(MI1, MI2);
>> +    Commuted = true;
>> +  }
>> +
>> +  // Avoid reassociating operands when it won't provide any benefit. If
>> both
>> +  // operands are produced by instructions of this type, we may already
>> +  // have the optimal sequence.
>> +  if (MI2->getOpcode() == AssocOpcode)
>> +    return nullptr;
>> +
>> +  // The instruction must only be used by the other instruction that we
>> +  // reassociate with.
>> +  if (checkPrevOneUse &&
>> !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
>> +    return nullptr;
>> +
>> +  // We must match a simple chain of dependent ops.
>> +  // TODO: This check is not necessary for the earliest instruction in
>> the
>> +  // sequence. Instead of a sequence of 3 dependent instructions with
>> the same
>> +  // opcode, we only need to find a sequence of 2 dependent instructions
>> with
>> +  // the same opcode plus 1 other instruction that adds to the height of
>> the
>> +  // trace.
>> +  if (MI1->getOpcode() != AssocOpcode)
>> +    return nullptr;
>> +
>> +  return MI1;
>> +}
>> +
>> +/// Select a pattern based on how the operands of each associative
>> operation
>> +/// need to be commuted.
>> +static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev,
>> +                                                     bool CommuteRoot) {
>> +  if (CommutePrev) {
>> +    if (CommuteRoot)
>> +      return MachineCombinerPattern::MC_REASSOC_XA_YB;
>> +    return MachineCombinerPattern::MC_REASSOC_XA_BY;
>> +  } else {
>> +    if (CommuteRoot)
>> +      return MachineCombinerPattern::MC_REASSOC_AX_YB;
>> +    return MachineCombinerPattern::MC_REASSOC_AX_BY;
>> +  }
>> +}
>> +
>> +bool X86InstrInfo::hasPattern(MachineInstr &Root,
>> +        SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern)
>> const {
>> +  if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
>> +    return false;
>> +
>> +  // TODO: There are many more associative instruction types to match:
>> +  //       1. Other forms of scalar FP add (non-AVX)
>> +  //       2. Other data types (double, integer, vectors)
>> +  //       3. Other math / logic operations (mul, and, or)
>> +  unsigned AssocOpcode = X86::VADDSSrr;
>> +
>> +  // TODO: There is nothing x86-specific here except the instruction
>> type.
>> +  // This logic could be hoisted into the machine combiner pass itself.
>> +  bool CommuteRoot;
>> +  if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true,
>> +                                              CommuteRoot)) {
>> +    bool CommutePrev;
>> +    if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) {
>> +      // We found a sequence of instructions that may be suitable for a
>> +      // reassociation of operands to increase ILP.
>> +      Pattern.push_back(getPattern(CommutePrev, CommuteRoot));
>> +      return true;
>> +    }
>> +  }
>> +
>> +  return false;
>> +}
>> +
>> +/// Attempt the following reassociation to reduce critical path length:
>> +///   B = A op X (Prev)
>> +///   C = B op Y (Root)
>> +///   ===>
>> +///   B = X op Y
>> +///   C = A op B
>> +static void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
>> +                           MachineCombinerPattern::MC_PATTERN Pattern,
>> +                           SmallVectorImpl<MachineInstr *> &InsInstrs,
>> +                           SmallVectorImpl<MachineInstr *> &DelInstrs,
>> +                           DenseMap<unsigned, unsigned>
>> &InstrIdxForVirtReg) {
>> +  MachineFunction *MF = Root.getParent()->getParent();
>> +  MachineRegisterInfo &MRI = MF->getRegInfo();
>> +  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
>> +  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
>> +  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII,
>> TRI);
>> +
>> +  // This array encodes the operand index for each parameter because the
>> +  // operands may be commuted. Each row corresponds to a pattern value,
>> +  // and each column specifies the index of A, B, X, Y.
>> +  unsigned OpIdx[4][4] = {
>> +    { 1, 1, 2, 2 },
>> +    { 1, 2, 2, 1 },
>> +    { 2, 1, 1, 2 },
>> +    { 2, 2, 1, 1 }
>> +  };
>> +
>> +  MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]);
>> +  MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]);
>> +  MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]);
>> +  MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]);
>> +  MachineOperand &OpC = Root.getOperand(0);
>> +
>> +  unsigned RegA = OpA.getReg();
>> +  unsigned RegB = OpB.getReg();
>> +  unsigned RegX = OpX.getReg();
>> +  unsigned RegY = OpY.getReg();
>> +  unsigned RegC = OpC.getReg();
>> +
>> +  if (TargetRegisterInfo::isVirtualRegister(RegA))
>> +    MRI.constrainRegClass(RegA, RC);
>> +  if (TargetRegisterInfo::isVirtualRegister(RegB))
>> +    MRI.constrainRegClass(RegB, RC);
>> +  if (TargetRegisterInfo::isVirtualRegister(RegX))
>> +    MRI.constrainRegClass(RegX, RC);
>> +  if (TargetRegisterInfo::isVirtualRegister(RegY))
>> +    MRI.constrainRegClass(RegY, RC);
>> +  if (TargetRegisterInfo::isVirtualRegister(RegC))
>> +    MRI.constrainRegClass(RegC, RC);
>> +
>> +  // Create a new virtual register for the result of (X op Y) instead of
>> +  // recycling RegB because the MachineCombiner's computation of the
>> critical
>> +  // path requires a new register definition rather than an existing one.
>> +  unsigned NewVR = MRI.createVirtualRegister(RC);
>> +  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
>> +
>> +  unsigned Opcode = Root.getOpcode();
>> +  bool KillA = OpA.isKill();
>> +  bool KillX = OpX.isKill();
>> +  bool KillY = OpY.isKill();
>> +
>> +  // Create new instructions for insertion.
>> +  MachineInstrBuilder MIB1 =
>> +    BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR)
>> +      .addReg(RegX, getKillRegState(KillX))
>> +      .addReg(RegY, getKillRegState(KillY));
>> +  InsInstrs.push_back(MIB1);
>> +
>> +  MachineInstrBuilder MIB2 =
>> +    BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
>> +      .addReg(RegA, getKillRegState(KillA))
>> +      .addReg(NewVR, getKillRegState(true));
>> +  InsInstrs.push_back(MIB2);
>> +
>> +  // Record old instructions for deletion.
>> +  DelInstrs.push_back(&Prev);
>> +  DelInstrs.push_back(&Root);
>> +}
>> +
>> +void X86InstrInfo::genAlternativeCodeSequence(
>> +    MachineInstr &Root,
>> +    MachineCombinerPattern::MC_PATTERN Pattern,
>> +    SmallVectorImpl<MachineInstr *> &InsInstrs,
>> +    SmallVectorImpl<MachineInstr *> &DelInstrs,
>> +    DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const {
>> +  MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo();
>> +
>> +  // Select the previous instruction in the sequence based on the input
>> pattern.
>> +  MachineInstr *Prev = nullptr;
>> +  if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY ||
>> +      Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY)
>> +    Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
>> +  else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB ||
>> +           Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB)
>> +    Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
>> +  else
>> +    assert("Unknown pattern for machine combiner");
>
> Use llvm_unreachable("msg") instead of assert.  Since string literals are
> converted to a true bool value, this assert will never be triggered.
>
>>
>
> +
>> +  reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs,
>> InstIdxForVirtReg);
>> +  return;
>> +}
>> +
>>  namespace {
>>    /// Create Global Base Reg pass. This initializes the PIC
>>    /// global base register for x86-32.
>>
>> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=239486&r1=239485&r2=239486&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
>> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Wed Jun 10 15:32:21 2015
>> @@ -26,6 +26,19 @@ namespace llvm {
>>    class X86RegisterInfo;
>>    class X86Subtarget;
>>
>> +  namespace MachineCombinerPattern {
>> +    enum MC_PATTERN : int {
>> +      // These are commutative variants for reassociating a computation
>> chain
>> +      // of the form:
>> +      //   B = A op X (Prev)
>> +      //   C = B op Y (Root)
>> +      MC_REASSOC_AX_BY = 0,
>> +      MC_REASSOC_AX_YB = 1,
>> +      MC_REASSOC_XA_BY = 2,
>> +      MC_REASSOC_XA_YB = 3,
>> +    };
>> +  } // end namespace MachineCombinerPattern
>> +
>>  namespace X86 {
>>    // X86 specific condition code. These correspond to X86_*_COND in
>>    // X86InstrInfo.td. They must be kept in synch.
>> @@ -429,6 +442,26 @@ public:
>>                               const MachineInstr *UseMI,
>>                               unsigned UseIdx) const override;
>>
>> +
>> +  bool useMachineCombiner() const override {
>> +    return true;
>> +  }
>> +
>> +  /// Return true when there is potentially a faster code sequence
>> +  /// for an instruction chain ending in <Root>. All potential patterns
>> are
>> +  /// output in the <Pattern> array.
>> +  bool hasPattern(
>> +      MachineInstr &Root,
>> +      SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const
>> override;
>> +
>> +  /// When hasPattern() finds a pattern, this function generates the
>> +  /// instructions that could replace the original code sequence.
>> +  void genAlternativeCodeSequence(
>> +          MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
>> +          SmallVectorImpl<MachineInstr *> &InsInstrs,
>> +          SmallVectorImpl<MachineInstr *> &DelInstrs,
>> +          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const
>> override;
>> +
>>    /// analyzeCompare - For a comparison instruction, return the source
>> registers
>>    /// in SrcReg and SrcReg2 if having two register operands, and the
>> value it
>>    /// compares against in CmpValue. Return true if the comparison
>> instruction
>>
>> Modified: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetMachine.cpp?rev=239486&r1=239485&r2=239486&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp (original)
>> +++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp Wed Jun 10 15:32:21
>> 2015
>> @@ -24,6 +24,10 @@
>>  #include "llvm/Target/TargetOptions.h"
>>  using namespace llvm;
>>
>> +static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
>> +                               cl::desc("Enable the machine combiner
>> pass"),
>> +                               cl::init(true), cl::Hidden);
>> +
>>  extern "C" void LLVMInitializeX86Target() {
>>    // Register the target.
>>    RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
>> @@ -224,6 +228,8 @@ bool X86PassConfig::addInstSelector() {
>>
>>  bool X86PassConfig::addILPOpts() {
>>    addPass(&EarlyIfConverterID);
>> +  if (EnableMachineCombinerPass)
>> +    addPass(&MachineCombinerID);
>>    return true;
>>  }
>>
>>
>> Modified: llvm/trunk/test/CodeGen/X86/fp-fast.ll
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fp-fast.ll?rev=239486&r1=239485&r2=239486&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/fp-fast.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/fp-fast.ll Wed Jun 10 15:32:21 2015
>> @@ -114,3 +114,81 @@ define float @test11(float %a) {
>>    ret float %t2
>>  }
>>
>> +; Verify that the first two adds are independent regardless of how the
>> inputs are
>> +; commuted. The destination registers are used as source registers for
>> the third add.
>> +
>> +define float @reassociate_adds1(float %x0, float %x1, float %x2, float
>> %x3) {
>> +; CHECK-LABEL: reassociate_adds1:
>> +; CHECK:       # BB#0:
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    retq
>> +  %t0 = fadd float %x0, %x1
>> +  %t1 = fadd float %t0, %x2
>> +  %t2 = fadd float %t1, %x3
>> +  ret float %t2
>> +}
>> +
>> +define float @reassociate_adds2(float %x0, float %x1, float %x2, float
>> %x3) {
>> +; CHECK-LABEL: reassociate_adds2:
>> +; CHECK:       # BB#0:
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    retq
>> +  %t0 = fadd float %x0, %x1
>> +  %t1 = fadd float %x2, %t0
>> +  %t2 = fadd float %t1, %x3
>> +  ret float %t2
>> +}
>> +
>> +define float @reassociate_adds3(float %x0, float %x1, float %x2, float
>> %x3) {
>> +; CHECK-LABEL: reassociate_adds3:
>> +; CHECK:       # BB#0:
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    retq
>> +  %t0 = fadd float %x0, %x1
>> +  %t1 = fadd float %t0, %x2
>> +  %t2 = fadd float %x3, %t1
>> +  ret float %t2
>> +}
>> +
>> +define float @reassociate_adds4(float %x0, float %x1, float %x2, float
>> %x3) {
>> +; CHECK-LABEL: reassociate_adds4:
>> +; CHECK:       # BB#0:
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    retq
>> +  %t0 = fadd float %x0, %x1
>> +  %t1 = fadd float %x2, %t0
>> +  %t2 = fadd float %x3, %t1
>> +  ret float %t2
>> +}
>> +
>> +; Verify that we reassociate some of these ops. The optimal balanced
>> tree of adds is not
>> +; produced because that would cost more compile time.
>> +
>> +define float @reassociate_adds5(float %x0, float %x1, float %x2, float
>> %x3, float %x4, float %x5, float %x6, float %x7) {
>> +; CHECK-LABEL: reassociate_adds5:
>> +; CHECK:       # BB#0:
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    vaddss %xmm7, %xmm6, %xmm1
>> +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
>> +; CHECK-NEXT:    retq
>> +  %t0 = fadd float %x0, %x1
>> +  %t1 = fadd float %t0, %x2
>> +  %t2 = fadd float %t1, %x3
>> +  %t3 = fadd float %t2, %x4
>> +  %t4 = fadd float %t3, %x5
>> +  %t5 = fadd float %t4, %x6
>> +  %t6 = fadd float %t5, %x7
>> +  ret float %t6
>> +}
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150610/d5db3392/attachment.html>