[llvm] 738c73a - RegAllocFast: Make self loop live-out heuristic more aggressive

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 16 10:12:49 PDT 2020


Author: Matt Arsenault
Date: 2020-09-16T13:12:38-04:00
New Revision: 738c73a454881ca78214816754c1b82941d0cd26

URL: https://github.com/llvm/llvm-project/commit/738c73a454881ca78214816754c1b82941d0cd26
DIFF: https://github.com/llvm/llvm-project/commit/738c73a454881ca78214816754c1b82941d0cd26.diff

LOG: RegAllocFast: Make self loop live-out heuristic more aggressive

This currently has no impact on generated code, but prevents sizeable
code size regressions after D52010 by avoiding spilling and reloading
every value inside blocks that loop back to themselves. Add a baseline
test which would regress without this patch.
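
As a rough illustration of the check this patch adds, here is a minimal
standalone C++ sketch (toy types only; ToyInst and mayLiveOutOfSelfLoop are
hypothetical stand-ins, not the pass's actual data structures): in a block
that is its own successor, a virtual register with a unique def that strictly
precedes all of its uses never carries a value across the back edge, so the
fast allocator can treat it as local to the block.

#include <cstdio>
#include <vector>

// Toy model of one instruction's effect on a single virtual register.
struct ToyInst {
  bool DefinesReg; // instruction writes the register
  bool UsesReg;    // instruction reads the register
};

// Conservatively returns true if the register may be live across the self
// loop's back edge: no def in the block, more than one def, or a use that
// is not strictly dominated by the unique def.
static bool mayLiveOutOfSelfLoop(const std::vector<ToyInst> &Block) {
  int DefIdx = -1;
  int Size = static_cast<int>(Block.size());
  for (int I = 0; I != Size; ++I) {
    if (Block[I].DefinesReg) {
      if (DefIdx != -1)
        return true; // Multiple defs: give up and assume live-out.
      DefIdx = I;
    }
  }
  if (DefIdx == -1)
    return true; // No def in the block: the value comes from a predecessor.
  for (int I = 0; I != Size; ++I) {
    if (Block[I].UsesReg && I <= DefIdx)
      return true; // A use at or before the def reads last iteration's value.
  }
  return false; // The def strictly precedes every use: dead across the edge.
}

int main() {
  // load; store: the def precedes its only use, so no spill is required.
  std::vector<ToyInst> DefThenUse = {{true, false}, {false, true}};
  // store; load: the use reads the value defined on the previous iteration.
  std::vector<ToyInst> UseThenDef = {{false, true}, {true, false}};
  std::printf("def-then-use may live out: %d\n",
              mayLiveOutOfSelfLoop(DefThenUse)); // 0
  std::printf("use-then-def may live out: %d\n",
              mayLiveOutOfSelfLoop(UseThenDef)); // 1
  return 0;
}

These two shapes correspond to the self_loop_single_def_use and
self_loop_def_after_use tests added below.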

Added: 
    llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir

Modified: 
    llvm/lib/CodeGen/RegAllocFast.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index d93fd8f601c6..db1b904fb2e6 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -263,6 +263,20 @@ int RegAllocFast::getStackSpaceFor(Register VirtReg) {
   return FrameIdx;
 }
 
+static bool dominates(MachineBasicBlock &MBB,
+                      MachineBasicBlock::const_iterator A,
+                      MachineBasicBlock::const_iterator B) {
+  auto MBBEnd = MBB.end();
+  if (B == MBBEnd)
+    return true;
+
+  MachineBasicBlock::const_iterator I = MBB.begin();
+  for (; &*I != A && &*I != B; ++I)
+    ;
+
+  return &*I == A;
+}
+
 /// Returns false if \p VirtReg is known to not live out of the current block.
 bool RegAllocFast::mayLiveOut(Register VirtReg) {
   if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg))) {
@@ -270,11 +284,16 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) {
     return !MBB->succ_empty();
   }
 
-  // If this block loops back to itself, it would be necessary to check whether
-  // the use comes after the def.
+  const MachineInstr *SelfLoopDef = nullptr;
+
+  // If this block loops back to itself, it is necessary to check whether the
+  // use comes after the def.
   if (MBB->isSuccessor(MBB)) {
-    MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
-    return true;
+    SelfLoopDef = MRI->getUniqueVRegDef(VirtReg);
+    if (!SelfLoopDef) {
+      MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+      return true;
+    }
   }
 
   // See if the first \p Limit uses of the register are all in the current
@@ -287,6 +306,16 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) {
       // Cannot be live-out if there are no successors.
       return !MBB->succ_empty();
     }
+
+    if (SelfLoopDef) {
+      // Try to handle some simple cases to avoid spilling and reloading every
+      // value inside a self looping block.
+      if (SelfLoopDef == &UseInst ||
+          !dominates(*MBB, SelfLoopDef->getIterator(), UseInst.getIterator())) {
+        MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+        return true;
+      }
+    }
   }
 
   return false;
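
The dominates() helper added above relies on a basic block being straight-line
code: a single forward walk from the block's start reaches the dominating
position first, so no extra data structure is needed. A self-contained sketch
of the same shape (ToyBlock and ToyIter are hypothetical stand-ins for the
MachineBasicBlock instruction list; the real helper compares instruction
addresses rather than iterators):

#include <cassert>
#include <iterator>
#include <list>

using ToyBlock = std::list<int>;
using ToyIter = ToyBlock::const_iterator;

// Within one block, A dominates B exactly when a forward walk from the
// block's start reaches A no later than B. end() acts as "past the last
// instruction" and is dominated by everything.
static bool dominates(const ToyBlock &MBB, ToyIter A, ToyIter B) {
  if (B == MBB.end())
    return true;
  ToyIter I = MBB.begin();
  while (I != A && I != B)
    ++I;
  return I == A;
}

int main() {
  ToyBlock Block = {0, 1, 2, 3};
  ToyIter First = Block.begin();
  ToyIter Third = std::next(First, 2);
  assert(dominates(Block, First, Third));  // an earlier position dominates
  assert(!dominates(Block, Third, First)); // a later one does not
  assert(dominates(Block, Third, Block.end()));
  return 0;
}

Each query walks the block from the top, but the number of queries is bounded
by the use limit mayLiveOut() already applies, so the scan stays cheap.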

diff --git a/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
new file mode 100644
index 000000000000..32de26283781
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir
@@ -0,0 +1,185 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: self_loop_single_def_use
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  ; GCN-LABEL: name: self_loop_single_def_use
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x80000000)
+  ; GCN:   liveins: $vgpr0_vgpr1
+  ; GCN:   SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+---
+name: self_loop_multi_def
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  ; GCN-LABEL: name: self_loop_multi_def
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x80000000)
+  ; GCN:   liveins: $vgpr0_vgpr1
+  ; GCN:   SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN:   renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec
+  ; GCN:   GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec
+  ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+# There's a single def inside the self loop, but it's also a use.
+
+---
+name: self_loop_def_use_same_inst
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  ; GCN-LABEL: name: self_loop_def_use_same_inst
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x80000000)
+  ; GCN:   liveins: $vgpr0_vgpr1
+  ; GCN:   SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN:   renamable $vgpr0 = V_ADD_U32_e32 1, undef $vgpr0, implicit $exec
+  ; GCN:   $vgpr1_vgpr2 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN:   GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec
+  ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    %1:vgpr_32 = V_ADD_U32_e32 1, undef %1, implicit $exec
+    GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+---
+name: self_loop_def_after_use
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  ; GCN-LABEL: name: self_loop_def_after_use
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x80000000)
+  ; GCN:   liveins: $vgpr0_vgpr1
+  ; GCN:   SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN:   GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, undef renamable $vgpr0, 0, 0, 0, 0, implicit $exec
+  ; GCN:   renamable $vgpr2 = V_ADD_U32_e64 1, 1, 0, implicit $exec
+  ; GCN:   SI_SPILL_V32_SAVE killed $vgpr2, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5)
+  ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    GLOBAL_STORE_DWORD %0, undef %1, 0, 0, 0, 0, implicit $exec
+    %1:vgpr_32 = V_ADD_U32_e64 1, 1, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+---
+name: self_loop_single_subreg_def_use
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  ; GCN-LABEL: name: self_loop_single_subreg_def_use
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x80000000)
+  ; GCN:   liveins: $vgpr0_vgpr1
+  ; GCN:   SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5)
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN:   $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5)
+  ; GCN:   undef renamable $vgpr3 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr2_vgpr3
+  ; GCN:   GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, undef renamable $vgpr3, 0, 0, 0, 0, implicit $exec
+  ; GCN:   SI_SPILL_V64_SAVE killed $vgpr2_vgpr3, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.1, align 4, addrspace 5)
+  ; GCN:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GCN: bb.2:
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+
+  bb.1:
+    undef %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %0, undef %1.sub1, 0, 0, 0, 0, implicit $exec
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+
+...
