[llvm] r248611 - PeepholeOptimizer: Remove redundant copies

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 25 13:22:12 PDT 2015


Author: arsenm
Date: Fri Sep 25 15:22:12 2015
New Revision: 248611

URL: http://llvm.org/viewvc/llvm-project?rev=248611&view=rev
Log:
PeepholeOptimizer: Remove redundant copies

If a virtual register is copied and another copy of the same
source register was already seen, replace uses of the new copy
with the destination of the previous copy. This only handles
the simplest cases for now.

This pattern shows up because of the various operand
restrictions AMDGPU has, which require inserting copies
depending on the register class of the operands.
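
For illustration, the handled case looks like the following (a
sketch in the same pseudo-MIR notation used in the comments in
the patch; the register names are hypothetical):

  %vreg1 = COPY %vreg0
  ...
  %vreg2 = COPY %vreg0

If the second copy has the same source register, the same source
subregister index, and the same destination register class as the
first, uses of %vreg2 are rewritten to use %vreg1 and the second
COPY is erased.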

Modified:
    llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
    llvm/trunk/test/CodeGen/AMDGPU/address-space.ll
    llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll

Modified: llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp?rev=248611&r1=248610&r2=248611&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp (original)
+++ llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp Fri Sep 25 15:22:12 2015
@@ -160,6 +160,15 @@ namespace {
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+
+    /// \brief If \p MI is a copy of a virtual register, track the source in
+    /// the set \p CopySrcRegs and the map \p CopyMIs. If the source was
+    /// previously seen as a copy source, replace the uses of this copy's
+    /// destination register with the previous copy's destination register.
+    bool foldRedundantCopy(MachineInstr *MI,
+                           SmallSet<unsigned, 4> &CopySrcRegs,
+                           DenseMap<unsigned, MachineInstr*> &CopyMIs);
+
     bool isLoadFoldable(MachineInstr *MI,
                         SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
 
@@ -1335,6 +1344,65 @@ bool PeepholeOptimizer::foldImmediate(Ma
   return false;
 }
 
+// FIXME: This is very simple and misses some cases which should be handled when
+// motivating examples are found.
+//
+// The copy rewriting logic should look at uses as well as defs and be able to
+// eliminate copies across blocks.
+//
+// Later copies that are subregister extracts will also not be eliminated since
+// only the first copy is considered.
+//
+// e.g.
+// %vreg1 = COPY %vreg0
+// %vreg2 = COPY %vreg0:sub1
+//
+// Should replace %vreg2 uses with %vreg1:sub1
+bool PeepholeOptimizer::foldRedundantCopy(
+  MachineInstr *MI,
+  SmallSet<unsigned, 4> &CopySrcRegs,
+  DenseMap<unsigned, MachineInstr *> &CopyMIs) {
+  assert(MI->isCopy());
+
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+    return false;
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+    return false;
+
+  if (CopySrcRegs.insert(SrcReg).second) {
+    // First copy of this reg seen.
+    CopyMIs.insert(std::make_pair(SrcReg, MI));
+    return false;
+  }
+
+  MachineInstr *PrevCopy = CopyMIs.find(SrcReg)->second;
+
+  unsigned SrcSubReg = MI->getOperand(1).getSubReg();
+  unsigned PrevSrcSubReg = PrevCopy->getOperand(1).getSubReg();
+
+  // Can't replace different subregister extracts.
+  if (SrcSubReg != PrevSrcSubReg)
+    return false;
+
+  unsigned PrevDstReg = PrevCopy->getOperand(0).getReg();
+
+  // Only replace if both copies have the same destination register class.
+  //
+  // TODO: If we have multiple copies to different register classes, we may want
+  // to track multiple copies of the same source register.
+  if (MRI->getRegClass(DstReg) != MRI->getRegClass(PrevDstReg))
+    return false;
+
+  MRI->replaceRegWith(DstReg, PrevDstReg);
+
+  // Lifetime of the previous copy has been extended.
+  MRI->clearKillFlags(PrevDstReg);
+  return true;
+}
+
 bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipOptnoneFunction(*MF.getFunction()))
     return false;
@@ -1368,6 +1436,10 @@ bool PeepholeOptimizer::runOnMachineFunc
     DenseMap<unsigned, MachineInstr*> ImmDefMIs;
     SmallSet<unsigned, 16> FoldAsLoadDefCandidates;
 
+    // Virtual registers that are copied from, and the first copy seen of each.
+    SmallSet<unsigned, 4> CopySrcRegs;
+    DenseMap<unsigned, MachineInstr *> CopySrcMIs;
+
     for (MachineBasicBlock::iterator
            MII = I->begin(), MIE = I->end(); MII != MIE; ) {
       MachineInstr *MI = &*MII;
@@ -1409,6 +1481,13 @@ bool PeepholeOptimizer::runOnMachineFunc
         Changed = true;
         continue;
       }
+
+      if (MI->isCopy() && foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs)) {
+        LocalMIs.erase(MI);
+        MI->eraseFromParent();
+        Changed = true;
+        continue;
+      }
 
       if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) {
         SeenMoveImm = true;

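To make the bookkeeping concrete, here is a minimal standalone
sketch of the "first copy wins" scheme in plain C++17. It is not
LLVM code: the Copy struct, the register numbers, and the printed
output are invented for illustration.

#include <cstdio>
#include <unordered_map>
#include <vector>

// Standalone sketch of the "first copy wins" scheme: copies are
// visited in block order, the first copy of each source register
// is remembered, and a later identical copy is folded into it.
struct Copy {
  unsigned Dst;    // destination virtual register
  unsigned Src;    // source virtual register
  unsigned SubReg; // source subregister index (0 = whole register)
};

int main() {
  const std::vector<Copy> Block = {
      {1, 0, 0}, // %vreg1 = COPY %vreg0
      {2, 0, 0}, // %vreg2 = COPY %vreg0      (redundant)
      {3, 0, 1}, // %vreg3 = COPY %vreg0:sub1 (different subreg)
  };

  // Source register -> first copy seen of that register.
  std::unordered_map<unsigned, const Copy *> FirstCopy;
  for (const Copy &C : Block) {
    auto [It, Inserted] = FirstCopy.try_emplace(C.Src, &C);
    if (Inserted) { // first copy of this source register seen
      std::printf("keep  %%vreg%u = COPY %%vreg%u\n", C.Dst, C.Src);
      continue;
    }
    const Copy *Prev = It->second;
    // Mirror the pass's restriction: only fold copies that read the
    // same subregister. (The real pass additionally requires matching
    // destination register classes.)
    if (C.SubReg != Prev->SubReg) {
      std::printf("keep  %%vreg%u = COPY %%vreg%u:sub%u\n",
                  C.Dst, C.Src, C.SubReg);
      continue;
    }
    std::printf("erase %%vreg%u = COPY %%vreg%u (reuse %%vreg%u)\n",
                C.Dst, C.Src, Prev->Dst);
  }
  return 0;
}

The sub1 extract is kept even though it could in principle be
rewritten, which is exactly the limitation the FIXME above
describes: only the first copy of each source register is
remembered.
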
Modified: llvm/trunk/test/CodeGen/AMDGPU/address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/address-space.ll?rev=248611&r1=248610&r2=248611&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/address-space.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/address-space.ll Fri Sep 25 15:22:12 2015
@@ -5,15 +5,11 @@
 
 %struct.foo = type { [3 x float], [3 x float] }
 
-; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is
-; already in a VGPR after the first read.
-
 ; CHECK-LABEL: {{^}}do_as_ptr_calcs:
 ; CHECK: s_load_dword [[SREG1:s[0-9]+]],
-; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]]
 ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
 ; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20
 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll?rev=248611&r1=248610&r2=248611&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll Fri Sep 25 15:22:12 2015
@@ -41,6 +41,32 @@ define void @test_sgpr_use_twice_ternary
   ret void
 }
 
+; GCN-LABEL: {{^}}test_use_s_v_s:
+; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+
+; GCN: buffer_load_dword [[VA0:v[0-9]+]]
+; GCN-NOT: v_mov_b32
+; GCN: buffer_load_dword [[VA1:v[0-9]+]]
+
+; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN-NOT: v_mov_b32
+
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
+; GCN: buffer_store_dword [[RESULT0]]
+; GCN: buffer_store_dword [[RESULT1]]
+define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
+  %va0 = load volatile float, float addrspace(1)* %in
+  %va1 = load volatile float, float addrspace(1)* %in
+  %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1
+  %fma1 = call float @llvm.fma.f32(float %a, float %va1, float %b) #1
+  store volatile float %fma0, float addrspace(1)* %out
+  store volatile float %fma1, float addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
 ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
@@ -199,14 +225,11 @@ define void @test_literal_use_twice_tern
 ; GCN-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; GCN-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
 ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
+; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
 
-; FIXME: Why do we end up with 2 copies of the same SGPR? These should be CSE'd
-; GCN: v_mov_b32_e32 [[VS1_1:v[0-9]+]], [[SGPR1]]
-; GCN: v_mov_b32_e32 [[VS1_0:v[0-9]+]], [[SGPR1]]
-
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1_0]], [[VK0]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
 ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1_1]], [[VK1]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
 
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
@@ -224,24 +247,16 @@ define void @test_s0_s1_k_f32(float addr
 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
 ; GCN-DAG: s_mov_b32 s[[SK0_SUB1:[0-9]+]], 0x40900000
 ; GCN-DAG: s_mov_b32 s[[SZERO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB0:[0-9]+]], s[[SZERO]]
-; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], s[[SK0_SUB1]]
-
-; GCN-DAG: s_mov_b32 s[[SK1_SUB0:[0-9]+]], 0x40b00000{{$}}
-
-; FIXME: Redundant copies
-; GCN: v_mov_b32_e32 v[[VS1_1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
-; GCN: v_mov_b32_e32 v[[VS1_1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
-; GCN: v_mov_b32_e32 v[[VS1_0_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
-; GCN: v_mov_b32_e32 v[[VS1_0_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
-
-
-; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_0_SUB0]]:[[VS1_0_SUB1]]{{\]}}, v{{\[}}[[VK0_SUB0]]:[[VK0_SUB1]]{{\]}}
+; GCN: v_mov_b32_e32 v[[VK0_SUB0:[0-9]+]], s[[SZERO]]
+; GCN: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], s[[SK0_SUB1]]
 
+; GCN-DAG: s_mov_b32 s[[SK1_SUB1:[0-9]+]], 0x40b00000{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
+; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
 ; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB0:[0-9]+]], s[[SZERO]]
-; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], s[[SK1_SUB0]]
-
-; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_1_SUB0]]:[[VS1_1_SUB1]]{{\]}}, v{{\[}}[[VK1_SUB0]]:[[VK1_SUB1]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], s[[SK1_SUB1]]
+; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK0_SUB0]]:[[VK0_SUB1]]{{\]}}
+; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK1_SUB0]]:[[VK1_SUB1]]{{\]}}
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
