[llvm] e1beebb - SplitKit: Don't further split subrange mask in buildCopy

Ruiling Song via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 12 16:37:17 PDT 2021


Author: Ruiling Song
Date: 2021-08-13T07:36:38+08:00
New Revision: e1beebbac5da99b2451f25c2028531c12a70860a

URL: https://github.com/llvm/llvm-project/commit/e1beebbac5da99b2451f25c2028531c12a70860a
DIFF: https://github.com/llvm/llvm-project/commit/e1beebbac5da99b2451f25c2028531c12a70860a.diff

LOG: SplitKit: Don't further split subrange mask in buildCopy

We may use several COPY instructions to copy the needed sub-registers
during a split, but the way the lanes are divided among those COPYs may
differ from the subranges of the old register. This caused a failure when
extending the subranges of the new register, because the LaneMasks of the
new register's subranges did not exactly match those of the old register.
Since we bundle the COPYs, there is no need to further refine the subranges
of the new register based on the LaneMasks of the inserted COPYs.
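To make the mismatch concrete, here is a small standalone sketch in plain
C++ (the integer masks are a simplification of LLVM's LaneBitmask, and the
values are hypothetical, loosely modeled on the MIR test below): the old
register has one subrange for sub0 and one for the remaining lanes, but the
split is implemented with two overlapping COPYs, so refining per COPY
produces masks that match neither old subrange.

// Standalone illustration, assuming a 32-lane register where subN owns
// bit N of the mask (a simplification of the real LaneBitmask layout).
#include <cstdint>
#include <cstdio>

int main() {
  // Subranges of the old register, as in the MIR test below.
  const uint32_t OldSub0 = 0x00000001u; // sub0
  const uint32_t OldRest = 0xFFFFFFFEu; // sub1..sub31

  // The split copies the value with two COPYs that overlap at sub16.
  const uint32_t CopyLo = 0x0001FFFEu; // sub1..sub16
  const uint32_t CopyHi = 0xFFFF0000u; // sub16..sub31

  // Refining the new register's subranges once per COPY carves OldRest
  // into the pieces below; none of them equals an old subrange mask, so
  // the later subrange extension finds no exact match.
  printf("old masks:   %#010x %#010x\n", OldSub0, OldRest);
  printf("CopyLo only: %#010x\n", CopyLo & ~CopyHi); // sub1..sub15
  printf("overlap:     %#010x\n", CopyLo & CopyHi);  // sub16
  printf("CopyHi only: %#010x\n", CopyHi & ~CopyLo); // sub17..sub31
  return 0;
}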

I am not sure whether further breaking cases exist. But since the
subranges of the new register are created from the LaneMasks of the old
register's subranges, it is highly likely that an exact LaneMask match will
always be found. We can look into making extendPHIKillRanges() handle the
subrange-mask-mismatch case if more such cases show up in the future.
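For context, the extension step effectively requires an exact-mask lookup.
A hypothetical standalone sketch of that lookup (invented names, simplified
types; not LLVM's actual extendPHIKillRanges() code):

#include <cstdint>
#include <optional>
#include <vector>

struct SubRange {
  uint32_t LaneMask; // simplified stand-in for llvm::LaneBitmask
  // live segments elided
};

// Scan the new interval's subranges for one whose mask equals the old
// subrange's mask; there is no partial-overlap fallback.
std::optional<size_t> findExactMask(const std::vector<SubRange> &NewSubRanges,
                                    uint32_t OldMask) {
  for (size_t I = 0; I < NewSubRanges.size(); ++I)
    if (NewSubRanges[I].LaneMask == OldMask) // exact equality only
      return I;
  return std::nullopt; // a miss here is the failure mode the patch avoids
}

int main() {
  // Masks as produced by per-COPY refinement (see the sketch above).
  std::vector<SubRange> New = {
      {0x00000001u}, {0x0000FFFEu}, {0x00010000u}, {0xFFFE0000u}};
  // The old subrange sub1..sub31 (0xFFFFFFFE) has no exact counterpart.
  return findExactMask(New, 0xFFFFFFFEu).has_value() ? 0 : 1;
}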

The test case was from D105065 by @arsenm.

Differential Revision: https://reviews.llvm.org/D107829

Added: 
    llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir

Modified: 
    llvm/lib/CodeGen/SplitKit.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index c70620fd75328..38493edf0af13 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -529,19 +529,12 @@ SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg,
               | getInternalReadRegState(!FirstCopy), SubIdx)
       .addReg(FromReg, 0, SubIdx);
 
-  BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
   SlotIndexes &Indexes = *LIS.getSlotIndexes();
   if (FirstCopy) {
     Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
   } else {
     CopyMI->bundleWithPred();
   }
-  LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx);
-  DestLI.refineSubRanges(Allocator, LaneMask,
-                         [Def, &Allocator](LiveInterval::SubRange &SR) {
-                           SR.createDeadDef(Def, Allocator);
-                         },
-                         Indexes, TRI);
   return Def;
 }
 
@@ -549,11 +542,11 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
     LaneBitmask LaneMask, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
   const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+  SlotIndexes &Indexes = *LIS.getSlotIndexes();
   if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
     // The full vreg is copied.
     MachineInstr *CopyMI =
         BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
-    SlotIndexes &Indexes = *LIS.getSlotIndexes();
     return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
   }
 
@@ -567,18 +560,26 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
   const TargetRegisterClass *RC = MRI.getRegClass(FromReg);
   assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class");
 
-  SmallVector<unsigned, 8> Indexes;
+  SmallVector<unsigned, 8> SubIndexes;
 
   // Abort if we cannot possibly implement the COPY with the given indexes.
-  if (!TRI.getCoveringSubRegIndexes(MRI, RC, LaneMask, Indexes))
+  if (!TRI.getCoveringSubRegIndexes(MRI, RC, LaneMask, SubIndexes))
     report_fatal_error("Impossible to implement partial COPY");
 
   SlotIndex Def;
-  for (unsigned BestIdx : Indexes) {
+  for (unsigned BestIdx : SubIndexes) {
     Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
                                 DestLI, Late, Def);
   }
 
+  BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+  DestLI.refineSubRanges(
+      Allocator, LaneMask,
+      [Def, &Allocator](LiveInterval::SubRange &SR) {
+        SR.createDeadDef(Def, Allocator);
+      },
+      Indexes, TRI);
+
   return Def;
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
new file mode 100644
index 0000000000000..6a855c29b7b44
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-regalloc -run-pass=greedy -o - %s | FileCheck %s
+
+# Initially %2 starts out with 2 subranges (one for sub0, and one for
+# the rest of the lanes). After %2 is split, after refineSubRanges the
+# newly created register has a different set of lane masks since the
+# copy bundle uses 2 different defs to cover the register. This was
+# fixed by calling refineSubRanges after all the COPYs have been inserted.
+
+---
+name: subrange_for_this_mask_not_found
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       7
+body:             |
+  ; CHECK-LABEL: name: subrange_for_this_mask_not_found
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK:   [[DEF1:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
+  ; CHECK:   SI_SPILL_V1024_SAVE [[DEF1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5)
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK:   S_NOP 0, implicit [[DEF1]]
+  ; CHECK:   S_NOP 0, implicit [[DEF1]]
+  ; CHECK:   [[DEF2:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
+  ; CHECK:   S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
+  ; CHECK: bb.2:
+  ; CHECK:   successors: %bb.3(0x80000000)
+  ; CHECK:   [[SI_SPILL_V1024_RESTORE:%[0-9]+]]:vreg_1024_align2 = SI_SPILL_V1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5)
+  ; CHECK:   undef %5.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:vreg_1024_align2 = COPY [[SI_SPILL_V1024_RESTORE]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 {
+  ; CHECK:     internal %5.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31:vreg_1024_align2 = COPY [[SI_SPILL_V1024_RESTORE]].sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
+  ; CHECK:   }
+  ; CHECK:   %5.sub0:vreg_1024_align2 = IMPLICIT_DEF
+  ; CHECK:   S_NOP 0, implicit %5.sub0
+  ; CHECK: bb.3:
+  ; CHECK:   successors: %bb.4(0x80000000)
+  ; CHECK:   S_NOP 0, implicit %5
+  ; CHECK: bb.4:
+  ; CHECK:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
+  ; CHECK:   [[DEF2:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF
+  ; CHECK:   S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc
+  ; CHECK: bb.5:
+  ; CHECK:   undef %3.sub0:vreg_1024_align2 = COPY [[DEF]]
+  ; CHECK:   S_NOP 0, implicit %3
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_1024_align2 = IMPLICIT_DEF
+    %2:vreg_1024_align2 = COPY %1
+
+  bb.1:
+    S_NOP 0, implicit %1
+    S_NOP 0, implicit %1
+    %1:vreg_1024_align2 = IMPLICIT_DEF
+    S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
+
+  bb.3:
+    %2.sub0:vreg_1024_align2 = IMPLICIT_DEF
+    S_NOP 0, implicit %2.sub0
+
+  bb.4:
+    S_NOP 0, implicit %2
+
+  bb.5:
+    %2:vreg_1024_align2 = IMPLICIT_DEF
+    S_CBRANCH_VCCNZ %bb.4, implicit undef $vcc
+
+  bb.6:
+    undef %4.sub0:vreg_1024_align2 = COPY %0
+    S_NOP 0, implicit %4
+...

More information about the llvm-commits mailing list