[llvm] r239486 - [x86] Add a reassociation optimization to increase ILP via the MachineCombiner pass
Richard Trieu
rtrieu at google.com
Wed Jun 10 16:10:56 PDT 2015
On Wed, Jun 10, 2015 at 1:32 PM, Sanjay Patel <spatel at rotateright.com>
wrote:
> Author: spatel
> Date: Wed Jun 10 15:32:21 2015
> New Revision: 239486
>
> URL: http://llvm.org/viewvc/llvm-project?rev=239486&view=rev
> Log:
> [x86] Add a reassociation optimization to increase ILP via the MachineCombiner pass
>
> This is a reimplementation of D9780 at the machine instruction level
> rather than the DAG.
>
> Use the MachineCombiner pass to reassociate scalar single-precision AVX
> additions (just a starting point; see the TODO comments) to increase ILP
> when it's safe to do so.
>
> The code is closely based on the existing MachineCombiner optimization
> that is implemented for AArch64.
>
> This patch should not cause the kind of spilling tragedy that led to the
> reversion of r236031.
>
> Differential Revision: http://reviews.llvm.org/D10321
>
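For anyone skimming: the win comes from breaking up a left-leaning chain of
dependent adds. A minimal C++ sketch of the idea (illustrative only, not
taken from the patch):

    // Serial form: each add waits on the previous result, so the critical
    // path is three dependent adds.
    float serial(float x0, float x1, float x2, float x3) {
      return ((x0 + x1) + x2) + x3;
    }

    // Reassociated form: (x0 + x1) and (x2 + x3) are independent and can
    // issue in parallel, shortening the critical path to two adds.
    float reassociated(float x0, float x1, float x2, float x3) {
      return (x0 + x1) + (x2 + x3);
    }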
>
> Modified:
> llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
> llvm/trunk/lib/Target/X86/X86InstrInfo.h
> llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
> llvm/trunk/test/CodeGen/X86/fp-fast.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=239486&r1=239485&r2=239486&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Wed Jun 10 15:32:21 2015
> @@ -6226,6 +6226,210 @@ hasHighOperandLatency(const InstrItinera
> return isHighLatencyDef(DefMI->getOpcode());
> }
>
> +/// If the input instruction is part of a chain of dependent ops that are
> +/// suitable for reassociation, return the earlier instruction in the sequence
> +/// that defines its first operand, otherwise return a nullptr.
> +/// If the instruction's operands must be commuted to be considered a
> +/// reassociation candidate, Commuted will be set to true.
> +static MachineInstr *isReassocCandidate(const MachineInstr &Inst,
> + unsigned AssocOpcode,
> + bool checkPrevOneUse,
> + bool &Commuted) {
> + if (Inst.getOpcode() != AssocOpcode)
> + return nullptr;
> +
> + MachineOperand Op1 = Inst.getOperand(1);
> + MachineOperand Op2 = Inst.getOperand(2);
> +
> + const MachineBasicBlock *MBB = Inst.getParent();
> + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
> +
> + // We need virtual register definitions.
> + MachineInstr *MI1 = nullptr;
> + MachineInstr *MI2 = nullptr;
> + if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
> + MI1 = MRI.getUniqueVRegDef(Op1.getReg());
> + if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
> + MI2 = MRI.getUniqueVRegDef(Op2.getReg());
> +
> + // And they need to be in the trace (otherwise, they won't have a depth).
> + if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB)
> + return nullptr;
> +
> + Commuted = false;
> + if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) {
> + std::swap(MI1, MI2);
> + Commuted = true;
> + }
> +
> + // Avoid reassociating operands when it won't provide any benefit. If both
> + // operands are produced by instructions of this type, we may already
> + // have the optimal sequence.
> + if (MI2->getOpcode() == AssocOpcode)
> + return nullptr;
> +
> + // The instruction must only be used by the other instruction that we
> + // reassociate with.
> + if (checkPrevOneUse && !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
> + return nullptr;
> +
> + // We must match a simple chain of dependent ops.
> + // TODO: This check is not necessary for the earliest instruction in the
> + // sequence. Instead of a sequence of 3 dependent instructions with the same
> + // opcode, we only need to find a sequence of 2 dependent instructions with
> + // the same opcode plus 1 other instruction that adds to the height of the
> + // trace.
> + if (MI1->getOpcode() != AssocOpcode)
> + return nullptr;
> +
> + return MI1;
> +}
> +
> +/// Select a pattern based on how the operands of each associative operation
> +/// need to be commuted.
> +static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev,
> + bool CommuteRoot) {
> + if (CommutePrev) {
> + if (CommuteRoot)
> + return MachineCombinerPattern::MC_REASSOC_XA_YB;
> + return MachineCombinerPattern::MC_REASSOC_XA_BY;
> + } else {
> + if (CommuteRoot)
> + return MachineCombinerPattern::MC_REASSOC_AX_YB;
> + return MachineCombinerPattern::MC_REASSOC_AX_BY;
> + }
> +}
> +
> +bool X86InstrInfo::hasPattern(MachineInstr &Root,
> + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
> + if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
> + return false;
> +
> + // TODO: There are many more associative instruction types to match:
> + // 1. Other forms of scalar FP add (non-AVX)
> + // 2. Other data types (double, integer, vectors)
> + // 3. Other math / logic operations (mul, and, or)
> + unsigned AssocOpcode = X86::VADDSSrr;
> +
> + // TODO: There is nothing x86-specific here except the instruction type.
> + // This logic could be hoisted into the machine combiner pass itself.
> + bool CommuteRoot;
> + if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true,
> + CommuteRoot)) {
> + bool CommutePrev;
> + if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) {
> + // We found a sequence of instructions that may be suitable for a
> + // reassociation of operands to increase ILP.
> + Pattern.push_back(getPattern(CommutePrev, CommuteRoot));
> + return true;
> + }
> + }
> +
> + return false;
> +}
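The UnsafeFPMath gate is essential here: FP addition is not associative, so
reassociation can change observable results. A small standalone C++
demonstration of why (mine, not from the patch):

    #include <cstdio>

    int main() {
      float a = 1e20f, b = -1e20f, c = 1.0f;
      std::printf("%g\n", (a + b) + c); // 1: a and b cancel, then add c
      std::printf("%g\n", a + (b + c)); // 0: c is absorbed into b first
      return 0;
    }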
> +
> +/// Attempt the following reassociation to reduce critical path length:
> +/// B = A op X (Prev)
> +/// C = B op Y (Root)
> +/// ===>
> +/// B = X op Y
> +/// C = A op B
> +static void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
> + MachineCombinerPattern::MC_PATTERN Pattern,
> + SmallVectorImpl<MachineInstr *> &InsInstrs,
> + SmallVectorImpl<MachineInstr *> &DelInstrs,
> + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
> + MachineFunction *MF = Root.getParent()->getParent();
> + MachineRegisterInfo &MRI = MF->getRegInfo();
> + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
> + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
> + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
> +
> + // This array encodes the operand index for each parameter because the
> + // operands may be commuted. Each row corresponds to a pattern value,
> + // and each column specifies the index of A, B, X, Y.
> + unsigned OpIdx[4][4] = {
> + { 1, 1, 2, 2 },
> + { 1, 2, 2, 1 },
> + { 2, 1, 1, 2 },
> + { 2, 2, 1, 1 }
> + };
> +
> + MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]);
> + MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]);
> + MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]);
> + MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]);
> + MachineOperand &OpC = Root.getOperand(0);
> +
> + unsigned RegA = OpA.getReg();
> + unsigned RegB = OpB.getReg();
> + unsigned RegX = OpX.getReg();
> + unsigned RegY = OpY.getReg();
> + unsigned RegC = OpC.getReg();
> +
> + if (TargetRegisterInfo::isVirtualRegister(RegA))
> + MRI.constrainRegClass(RegA, RC);
> + if (TargetRegisterInfo::isVirtualRegister(RegB))
> + MRI.constrainRegClass(RegB, RC);
> + if (TargetRegisterInfo::isVirtualRegister(RegX))
> + MRI.constrainRegClass(RegX, RC);
> + if (TargetRegisterInfo::isVirtualRegister(RegY))
> + MRI.constrainRegClass(RegY, RC);
> + if (TargetRegisterInfo::isVirtualRegister(RegC))
> + MRI.constrainRegClass(RegC, RC);
> +
> + // Create a new virtual register for the result of (X op Y) instead of
> + // recycling RegB because the MachineCombiner's computation of the critical
> + // path requires a new register definition rather than an existing one.
> + unsigned NewVR = MRI.createVirtualRegister(RC);
> + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
> +
> + unsigned Opcode = Root.getOpcode();
> + bool KillA = OpA.isKill();
> + bool KillX = OpX.isKill();
> + bool KillY = OpY.isKill();
> +
> + // Create new instructions for insertion.
> + MachineInstrBuilder MIB1 =
> + BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR)
> + .addReg(RegX, getKillRegState(KillX))
> + .addReg(RegY, getKillRegState(KillY));
> + InsInstrs.push_back(MIB1);
> +
> + MachineInstrBuilder MIB2 =
> + BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
> + .addReg(RegA, getKillRegState(KillA))
> + .addReg(NewVR, getKillRegState(true));
> + InsInstrs.push_back(MIB2);
> +
> + // Record old instructions for deletion.
> + DelInstrs.push_back(&Prev);
> + DelInstrs.push_back(&Root);
> +}
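A note on reading the OpIdx table: the pattern names encode operand order,
so AX means Prev = A op X, XA means Prev = X op A, and likewise BY/YB for
Root. A standalone C++ sketch that just prints the mapping (my
illustration, mirroring the table above, not part of the patch):

    #include <cstdio>

    int main() {
      // Rows correspond to the four MC_REASSOC_* pattern values; columns
      // give the operand index of A, B, X, Y.
      unsigned OpIdx[4][4] = {
        {1, 1, 2, 2}, // AX_BY: Prev = A op X, Root = B op Y
        {1, 2, 2, 1}, // AX_YB: Prev = A op X, Root = Y op B
        {2, 1, 1, 2}, // XA_BY: Prev = X op A, Root = B op Y
        {2, 2, 1, 1}  // XA_YB: Prev = X op A, Root = Y op B
      };
      const char *Names[] = {"AX_BY", "AX_YB", "XA_BY", "XA_YB"};
      for (int P = 0; P < 4; ++P)
        std::printf("%s: A=op%u B=op%u X=op%u Y=op%u\n", Names[P],
                    OpIdx[P][0], OpIdx[P][1], OpIdx[P][2], OpIdx[P][3]);
      return 0;
    }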
> +
> +void X86InstrInfo::genAlternativeCodeSequence(
> + MachineInstr &Root,
> + MachineCombinerPattern::MC_PATTERN Pattern,
> + SmallVectorImpl<MachineInstr *> &InsInstrs,
> + SmallVectorImpl<MachineInstr *> &DelInstrs,
> + DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const {
> + MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo();
> +
> + // Select the previous instruction in the sequence based on the input pattern.
> + MachineInstr *Prev = nullptr;
> + if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY ||
> + Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY)
> + Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
> + else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB ||
> + Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB)
> + Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
> + else
> + assert("Unknown pattern for machine combiner");
Use llvm_unreachable("msg") instead of assert. Since a string literal
converts to a non-null pointer, and therefore to true, this assert will
never be triggered.
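Something like this (a sketch of the suggested fix; llvm_unreachable is
declared in llvm/Support/ErrorHandling.h):

    -  else
    -    assert("Unknown pattern for machine combiner");
    +  else
    +    llvm_unreachable("Unknown pattern for machine combiner");

Unlike the assert, llvm_unreachable also guards release builds, where
asserts compile away entirely.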
>
> +
> + reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
> + return;
> +}
> +
> namespace {
> /// Create Global Base Reg pass. This initializes the PIC
> /// global base register for x86-32.
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=239486&r1=239485&r2=239486&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Wed Jun 10 15:32:21 2015
> @@ -26,6 +26,19 @@ namespace llvm {
> class X86RegisterInfo;
> class X86Subtarget;
>
> + namespace MachineCombinerPattern {
> + enum MC_PATTERN : int {
> + // These are commutative variants for reassociating a computation chain
> + // of the form:
> + // B = A op X (Prev)
> + // C = B op Y (Root)
> + MC_REASSOC_AX_BY = 0,
> + MC_REASSOC_AX_YB = 1,
> + MC_REASSOC_XA_BY = 2,
> + MC_REASSOC_XA_YB = 3,
> + };
> + } // end namespace MachineCombinerPattern
> +
> namespace X86 {
> // X86 specific condition code. These correspond to X86_*_COND in
> // X86InstrInfo.td. They must be kept in synch.
> @@ -429,6 +442,26 @@ public:
> const MachineInstr *UseMI,
> unsigned UseIdx) const override;
>
> +
> + bool useMachineCombiner() const override {
> + return true;
> + }
> +
> + /// Return true when there is potentially a faster code sequence
> + /// for an instruction chain ending in <Root>. All potential patterns are
> + /// output in the <Pattern> array.
> + bool hasPattern(
> + MachineInstr &Root,
> + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override;
> +
> + /// When hasPattern() finds a pattern, this function generates the
> + /// instructions that could replace the original code sequence.
> + void genAlternativeCodeSequence(
> + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
> + SmallVectorImpl<MachineInstr *> &InsInstrs,
> + SmallVectorImpl<MachineInstr *> &DelInstrs,
> + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
> +
> /// analyzeCompare - For a comparison instruction, return the source registers
> /// in SrcReg and SrcReg2 if having two register operands, and the value it
> /// compares against in CmpValue. Return true if the comparison instruction
>
> Modified: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetMachine.cpp?rev=239486&r1=239485&r2=239486&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp Wed Jun 10 15:32:21 2015
> @@ -24,6 +24,10 @@
> #include "llvm/Target/TargetOptions.h"
> using namespace llvm;
>
> +static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
> + cl::desc("Enable the machine combiner pass"),
> + cl::init(true), cl::Hidden);
> +
> extern "C" void LLVMInitializeX86Target() {
> // Register the target.
> RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
> @@ -224,6 +228,8 @@ bool X86PassConfig::addInstSelector() {
>
> bool X86PassConfig::addILPOpts() {
> addPass(&EarlyIfConverterID);
> + if (EnableMachineCombinerPass)
> + addPass(&MachineCombinerID);
> return true;
> }
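Since the new cl::opt defaults to true and is Hidden, the pass can be
toggled from the llc command line when triaging a codegen difference, e.g.
(illustrative invocation; test.ll is a placeholder):

    llc -x86-machine-combiner=false test.ll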
>
>
> Modified: llvm/trunk/test/CodeGen/X86/fp-fast.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fp-fast.ll?rev=239486&r1=239485&r2=239486&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/fp-fast.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/fp-fast.ll Wed Jun 10 15:32:21 2015
> @@ -114,3 +114,81 @@ define float @test11(float %a) {
> ret float %t2
> }
>
> +; Verify that the first two adds are independent regardless of how the inputs are
> +; commuted. The destination registers are used as source registers for the third add.
> +
> +define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
> +; CHECK-LABEL: reassociate_adds1:
> +; CHECK: # BB#0:
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: retq
> + %t0 = fadd float %x0, %x1
> + %t1 = fadd float %t0, %x2
> + %t2 = fadd float %t1, %x3
> + ret float %t2
> +}
> +
> +define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
> +; CHECK-LABEL: reassociate_adds2:
> +; CHECK: # BB#0:
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: retq
> + %t0 = fadd float %x0, %x1
> + %t1 = fadd float %x2, %t0
> + %t2 = fadd float %t1, %x3
> + ret float %t2
> +}
> +
> +define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
> +; CHECK-LABEL: reassociate_adds3:
> +; CHECK: # BB#0:
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: retq
> + %t0 = fadd float %x0, %x1
> + %t1 = fadd float %t0, %x2
> + %t2 = fadd float %x3, %t1
> + ret float %t2
> +}
> +
> +define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
> +; CHECK-LABEL: reassociate_adds4:
> +; CHECK: # BB#0:
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: retq
> + %t0 = fadd float %x0, %x1
> + %t1 = fadd float %x2, %t0
> + %t2 = fadd float %x3, %t1
> + ret float %t2
> +}
> +
> +; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
> +; produced because that would cost more compile time.
> +
> +define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
> +; CHECK-LABEL: reassociate_adds5:
> +; CHECK: # BB#0:
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: vaddss %xmm7, %xmm6, %xmm1
> +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
> +; CHECK-NEXT: retq
> + %t0 = fadd float %x0, %x1
> + %t1 = fadd float %t0, %x2
> + %t2 = fadd float %t1, %x3
> + %t3 = fadd float %t2, %x4
> + %t4 = fadd float %t3, %x5
> + %t5 = fadd float %t4, %x6
> + %t6 = fadd float %t5, %x7
> + ret float %t6
> +}
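Tracing the CHECK lines in reassociate_adds5: the accumulator chain through
%xmm0 is adds 1, 3, 5, and 7, so the critical path drops from seven
dependent adds in the IR to four, while the three pair-adds into %xmm1 are
free to execute alongside it.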