[llvm] r231148 - Fix a problem where the TwoAddressInstructionPass would generate redundant register moves in a loop.

Eric Christopher echristo at gmail.com
Tue Mar 3 14:03:03 PST 2015


Author: echristo
Date: Tue Mar  3 16:03:03 2015
New Revision: 231148

URL: http://llvm.org/viewvc/llvm-project?rev=231148&view=rev
Log:
Fix a problem where the TwoAddressInstructionPass would generate redundant register moves in a loop.

From:

int M, total;
void foo() {
  int i;
  for (i = 0; i < M; i++) {
    total = total + i / 2;
  }
}

This is the kernel loop:

.LBB0_2:                                # %for.body
                                        # =>This Inner Loop Header: Depth=1
        movl    %edx, %esi
        movl    %ecx, %edx
        shrl    $31, %edx
        addl    %ecx, %edx
        sarl    %edx
        addl    %esi, %edx
        incl    %ecx
        cmpl    %eax, %ecx
        jl      .LBB0_2
--------------------------
The first mov insn "movl %edx, %esi" could be removed if we change "addl %esi, %edx" to "addl %edx, %esi".
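
For illustration, here is roughly what the kernel loop looks like once the copy is gone (a sketch, not output from an actual build; register assignments may differ, the point is that the accumulator stays in a single register across iterations):

.LBB0_2:                                # %for.body
        movl    %ecx, %edx
        shrl    $31, %edx
        addl    %ecx, %edx
        sarl    %edx
        addl    %edx, %esi
        incl    %ecx
        cmpl    %eax, %ecx
        jl      .LBB0_2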

The IR before TwoAddressInstructionPass is:
BB#2: derived from LLVM BB %for.body
    Predecessors according to CFG: BB#1 BB#2
    %vreg3<def> = COPY %vreg12<kill>; GR32:%vreg3,%vreg12
    %vreg2<def> = COPY %vreg11<kill>; GR32:%vreg2,%vreg11
    %vreg7<def,tied1> = SHR32ri %vreg3<tied0>, 31, %EFLAGS<imp-def,dead>; GR32:%vreg7,%vreg3
    %vreg8<def,tied1> = ADD32rr %vreg3<tied0>, %vreg7<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg8,%vreg3,%vreg7
    %vreg9<def,tied1> = SAR32r1 %vreg8<kill,tied0>, %EFLAGS<imp-def,dead>; GR32:%vreg9,%vreg8
    %vreg4<def,tied1> = ADD32rr %vreg9<kill,tied0>, %vreg2<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg4,%vreg9,%vreg2
    %vreg5<def,tied1> = INC64_32r %vreg3<kill,tied0>, %EFLAGS<imp-def,dead>; GR32:%vreg5,%vreg3
    CMP32rr %vreg5, %vreg0, %EFLAGS<imp-def>; GR32:%vreg5,%vreg0
    %vreg11<def> = COPY %vreg4; GR32:%vreg11,%vreg4
    %vreg12<def> = COPY %vreg5<kill>; GR32:%vreg12,%vreg5
    JL_4 <BB#2>, %EFLAGS<imp-use,kill>

Now TwoAddressInstructionPass will choose vreg9 to be tied with vreg4. However, it doesn't see that there is a copy from vreg4 to vreg11 and another copy from vreg11 to vreg2 inside the loop body. To remove those copies, it is necessary to tie vreg2 with vreg4 instead of vreg9. This code pattern commonly appears when there is a reduction operation in a loop.

So check for a reversed copy chain; when we encounter one, commute the add instruction so we can avoid a copy.
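
Concretely, in the IR above the reversed chain is %vreg2 <- %vreg11 <- %vreg4: %vreg2 is defined only by "COPY %vreg11" and %vreg11 only by "COPY %vreg4". With this check, isProfitableToCommute sees the chain from the untied operand (%vreg2) back to the def (%vreg4), the ADD32rr is commuted so that vreg2 becomes the tied operand, and those two copies can then be coalesced away.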

Patch by Wei Mi.
http://reviews.llvm.org/D7806

Added:
    llvm/trunk/test/CodeGen/X86/twoaddr-coalesce-3.ll
Modified:
    llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp

Modified: llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp?rev=231148&r1=231147&r2=231148&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp (original)
+++ llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp Tue Mar  3 16:03:03 2015
@@ -102,6 +102,8 @@ class TwoAddressInstructionPass : public
   bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg,
                             MachineBasicBlock::iterator OldPos);
 
+  bool isRevCopyChain(unsigned FromReg, unsigned ToReg, int Maxlen);
+
   bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef);
 
   bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
@@ -309,6 +311,45 @@ sink3AddrInstruction(MachineInstr *MI, u
   return true;
 }
 
+/// getSingleDef -- return the MachineInstr* if it is the single def of the Reg
+/// in current BB.
+static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB,
+                                  const MachineRegisterInfo *MRI) {
+  MachineInstr *Ret = nullptr;
+  for (MachineInstr &DefMI : MRI->def_instructions(Reg)) {
+    if (DefMI.getParent() != BB || DefMI.isDebugValue())
+      continue;
+    if (!Ret)
+      Ret = &DefMI;
+    else if (Ret != &DefMI)
+      return nullptr;
+  }
+  return Ret;
+}
+
+/// Check if there is a reversed copy chain from FromReg to ToReg:
+/// %Tmp1 = copy %Tmp2;
+/// %FromReg = copy %Tmp1;
+/// %ToReg = add %FromReg ...
+/// %Tmp2 = copy %ToReg;
+/// MaxLen specifies the maximum length of the copy chain the func
+/// can walk through.
+bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg,
+                                               int Maxlen) {
+  unsigned TmpReg = FromReg;
+  for (int i = 0; i < Maxlen; i++) {
+    MachineInstr *Def = getSingleDef(TmpReg, MBB, MRI);
+    if (!Def || !Def->isCopy())
+      return false;
+
+    TmpReg = Def->getOperand(1).getReg();
+
+    if (TmpReg == ToReg)
+      return true;
+  }
+  return false;
+}
+
 /// noUseAfterLastDef - Return true if there are no intervening uses between the
 /// last instruction in the MBB that defines the specified register and the
 /// two-address instruction which is being processed. It also returns the last
@@ -574,6 +615,27 @@ isProfitableToCommute(unsigned regA, uns
   if (!noUseAfterLastDef(regB, Dist, LastDefB))
     return true;
 
+  // Look for situation like this:
+  // %reg101 = MOV %reg100
+  // %reg102 = ...
+  // %reg103 = ADD %reg102, %reg101
+  // ... = %reg103 ...
+  // %reg100 = MOV %reg103
+  // If there is a reversed copy chain from reg101 to reg103, commute the ADD
+  // to eliminate an otherwise unavoidable copy.
+  // FIXME:
+  // We can extend the logic further: If a pair of operands in an insn has
+  // been merged, the insn could be regarded as a virtual copy, and the virtual
+  // copy could also be used to construct a copy chain.
+  // To more generally minimize register copies, ideally the logic of two addr
+  // instruction pass should be integrated with register allocation pass where
+  // interference graph is available.
+  if (isRevCopyChain(regC, regA, 3))
+    return true;
+
+  if (isRevCopyChain(regB, regA, 3))
+    return false;
+
   // Since there are no intervening uses for both registers, then commute
   // if the def of regC is closer. Its live interval is shorter.
   return LastDefB && LastDefC && LastDefC > LastDefB;

Added: llvm/trunk/test/CodeGen/X86/twoaddr-coalesce-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/twoaddr-coalesce-3.ll?rev=231148&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/twoaddr-coalesce-3.ll (added)
+++ llvm/trunk/test/CodeGen/X86/twoaddr-coalesce-3.ll Tue Mar  3 16:03:03 2015
@@ -0,0 +1,84 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+; This test is to ensure the TwoAddrInstruction pass chooses the proper operands to
+; merge and generates fewer mov insns.
+
+ at M = common global i32 0, align 4
+ at total = common global i32 0, align 4
+ at g = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define void @foo() {
+entry:
+  %0 = load i32, i32* @M, align 4
+  %cmp3 = icmp sgt i32 %0, 0
+  br i1 %cmp3, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %total.promoted = load i32, i32* @total, align 4
+  br label %for.body
+
+; Check that only one mov will be generated in the kernel loop.
+; CHECK-LABEL: foo:
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: # %for.body
+; CHECK-NOT: mov
+; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
+; CHECK-NOT: mov
+; CHECK: shrl $31, [[REG1]]
+; CHECK-NOT: mov
+; CHECK: jl [[LOOP1]]
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ]
+  %i.04 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %div = sdiv i32 %i.04, 2
+  %add = add nsw i32 %div, %add5
+  %inc = add nuw nsw i32 %i.04, 1
+  %cmp = icmp slt i32 %inc, %0
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  store i32 %add, i32* @total, align 4
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @goo() {
+entry:
+  %0 = load i32, i32* @M, align 4
+  %cmp3 = icmp sgt i32 %0, 0
+  br i1 %cmp3, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %total.promoted = load i32, i32* @total, align 4
+  br label %for.body
+
+; Check that only two movs will be generated in the kernel loop.
+; CHECK-LABEL: goo:
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: # %for.body
+; CHECK-NOT: mov
+; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
+; CHECK-NOT: mov
+; CHECK: shrl $31, [[REG2]]
+; CHECK-NOT: mov
+; CHECK: movl {{.*}}, g(%rip)
+; CHECK: jl [[LOOP2]]
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ]
+  %i.04 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %div = sdiv i32 %i.04, 2
+  %add = add nsw i32 %div, %add5
+  store volatile i32 %add, i32* @g, align 4
+  %inc = add nuw nsw i32 %i.04, 1
+  %cmp = icmp slt i32 %inc, %0
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  store i32 %add, i32* @total, align 4
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+}
+




