[llvm] 08140d1 - [AMDGPU] Add optional tied-op for wwm-register's epilog spill restore

Christudasan Devadasan via llvm-commits llvm-commits at lists.llvm.org
Mon May 15 22:37:27 PDT 2023


Author: Christudasan Devadasan
Date: 2023-05-16T11:06:06+05:30
New Revision: 08140d165e3d5b2257b26099469f7ae10c284dc7

URL: https://github.com/llvm/llvm-project/commit/08140d165e3d5b2257b26099469f7ae10c284dc7
DIFF: https://github.com/llvm/llvm-project/commit/08140d165e3d5b2257b26099469f7ae10c284dc7.diff

LOG: [AMDGPU] Add optional tied-op for wwm-register's epilog spill restore

The COPY inserted in the epilog block before the return instruction as
part of ABI lowering can get optimized away during machine copy
propagation if the same register is used earlier in a wwm operation
that demands a prolog/epilog wwm-spill store/restore to preserve its
inactive lanes. With the spill restore in the epilog, the preceding
COPY appears to be dead during machine-cp. To avoid this, mark the same
register as a tied operand of the spill restore instruction to ensure a
use for the COPY.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D150381
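
As a minimal MIR sketch of the scenario the log describes (abbreviated
from the tests in this commit; operands are elided with "..." so this is
not verbatim compiler output): without the tied operand, machine-cp
treats the epilog reload as a full redefinition of $vgpr0 and deletes
the preceding COPY as dead.

    $vgpr0 = COPY $vgpr1                    ; outgoing return value
    $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, ...
    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET ...   ; full def; COPY looks dead
    $exec = S_MOV_B64 killed $sgpr4_sgpr5
    SI_RETURN implicit $vgpr0

With this patch, the reload carries an implicit use of $vgpr0 tied to
its def, so the COPY has a consumer and survives:

    $vgpr0 = COPY $vgpr1                    ; outgoing return value
    $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, ...
    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET ..., implicit $vgpr0(tied-def 0)
    $exec = S_MOV_B64 killed $sgpr4_sgpr5
    SI_RETURN implicit $vgpr0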

Added: 
    llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5d7a99a1b7c56..a3f89c03056d4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1644,6 +1644,33 @@ void SIRegisterInfo::buildSpillLoadStore(
 
     if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+
+    // The epilog restore of a wwm-scratch register can cause an undesired
+    // optimization during machine-cp after PrologEpilogInserter if the same
+    // register was assigned for return value ABI lowering with a COPY
+    // instruction. As shown below, with the epilog reload, the earlier COPY
+    // appears to be dead during machine-cp.
+    // ...
+    // v0 in WWM operation, needs the WWM spill at prolog/epilog.
+    // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
+    // ...
+    // Epilog block:
+    // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
+    // ...
+    // WWM spill restore to preserve the inactive lanes of v0.
+    // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
+    // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
+    // $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    // ...
+    // SI_RETURN implicit $vgpr0
+    // ...
+    // To fix it, mark the same reg as a tied op for such restore instructions
+    // so that it records a use for the preceding COPY.
+    if (!IsStore && MI != MBB.end() && MI->isReturn() &&
+        MI->readsRegister(SubReg, this)) {
+      MIB.addReg(SubReg, RegState::Implicit);
+      MIB->tieOperands(0, MIB->getNumOperands() - 1);
+    }
   }
 
   if (ScratchOffsetRegDelta != 0) {

diff --git a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
index f06133c8248a8..2f36bba5b75b9 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
+++ b/llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
@@ -28,7 +28,7 @@ body:             |
     ; GCN-NEXT: $sgpr35 = V_READLANE_B32 $vgpr0, 0
     ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
     ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
     ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
     ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31, implicit $vgpr0
     renamable $vgpr0 = V_WRITELANE_B32 $sgpr35, 0, killed $vgpr0

diff --git a/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir
new file mode 100644
index 0000000000000..c4fde5dd2da45
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tied-op-for-wwm-scratch-reg-spill-restore.mir
@@ -0,0 +1,142 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# The COPY that moves the return value to VGPR0 should not be removed during machine-cp. The spill restore of the same register that follows
+# is meant only to reload its inactive lanes. Marking the register itself as a tied-op in the spill reload prevents the undesired optimization.
+
+---
+name:            wwm_scratch_reg_spill_reload_of_outgoing_reg
+tracksRegLiveness: true
+machineFunctionInfo:
+  wwmReservedRegs: ['$vgpr0']
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+body:             |
+  bb.0:
+    liveins: $sgpr20, $vgpr1
+    ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_reg
+    ; GCN: liveins: $sgpr20, $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0
+    ; GCN-NEXT: $vgpr0 = COPY killed renamable $vgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: SI_RETURN implicit $vgpr0
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0
+    $vgpr0 = COPY killed renamable $vgpr1, implicit $exec
+    SI_RETURN implicit $vgpr0
+...
+
+# The reload of vgpr0 requires the tied-op since vgpr0 is a subreg of the outgoing tuple register vgpr0_vgpr1.
+# vgpr2 doesn't need the tied-op in its reload as it isn't holding any return value.
+---
+name:            wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+  wwmReservedRegs: ['$vgpr0', '$vgpr2']
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+body:             |
+  bb.0:
+    liveins: $sgpr20, $sgpr21, $vgpr1
+    ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_of_outgoing_tuple_subreg
+    ; GCN: liveins: $sgpr20, $sgpr21, $vgpr0, $vgpr1, $vgpr2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0
+    ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr21, 0, $vgpr2
+    ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr2 = IMPLICIT_DEF
+    $vgpr0 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr0
+    $vgpr2 = V_WRITELANE_B32 killed $sgpr21, 0, $vgpr2
+    $vgpr0 = COPY $vgpr1, implicit $exec
+    SI_RETURN implicit $vgpr0_vgpr1
+...
+
+# Tied op not required in the spill reload of vgpr2.
+
+---
+name:            wwm_scratch_reg_spill_reload_different_outgoing_reg
+tracksRegLiveness: true
+machineFunctionInfo:
+  wwmReservedRegs: ['$vgpr2']
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+body:             |
+  bb.0:
+    liveins: $sgpr20, $vgpr1
+    ; GCN-LABEL: name: wwm_scratch_reg_spill_reload_different_outgoing_reg
+    ; GCN: liveins: $sgpr20, $vgpr1, $vgpr2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: $vgpr2 = IMPLICIT_DEF
+    ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr2
+    ; GCN-NEXT: $vgpr0 = COPY $vgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
+    $vgpr2 = IMPLICIT_DEF
+    $vgpr2 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr2
+    $vgpr0 = COPY $vgpr1, implicit $exec
+    SI_RETURN implicit $vgpr0_vgpr1
+...
+
+# Tied op not required in the spill reload of vgpr40, which is in the CSR range.
+---
+name:            wwm_csr_spill_reload
+tracksRegLiveness: true
+machineFunctionInfo:
+  wwmReservedRegs: ['$vgpr40']
+  isEntryFunction: false
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+body:             |
+  bb.0:
+    liveins: $sgpr20, $vgpr1
+    ; GCN-LABEL: name: wwm_csr_spill_reload
+    ; GCN: liveins: $sgpr20, $vgpr1, $vgpr40
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: $vgpr40 = IMPLICIT_DEF
+    ; GCN-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr40
+    ; GCN-NEXT: $sgpr20 = V_READLANE_B32 $vgpr40, 0, implicit $exec
+    ; GCN-NEXT: $vgpr0 = COPY killed $vgpr1, implicit $exec
+    ; GCN-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    ; GCN-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+    ; GCN-NEXT: SI_RETURN implicit $vgpr0
+    $vgpr40 = IMPLICIT_DEF
+    $vgpr40 = V_WRITELANE_B32 killed $sgpr20, 0, $vgpr40
+    $sgpr20 = V_READLANE_B32 $vgpr40, 0, implicit $exec
+    $vgpr0 = COPY killed $vgpr1, implicit $exec
+    SI_RETURN implicit $vgpr0
+...
