[llvm] r370929 - AMDGPU: Handle frame index expansion with no free SGPRs pre gfx9

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 4 10:12:57 PDT 2019


Author: arsenm
Date: Wed Sep  4 10:12:57 2019
New Revision: 370929

URL: http://llvm.org/viewvc/llvm-project?rev=370929&view=rev
Log:
AMDGPU: Handle frame index expansion with no free SGPRs pre gfx9

Since the VALU add instruction must produce a carry out, even though it
is unused, expanding a frame index requires additional SGPRs. This can
be avoided by keeping the entire offset computation in SGPRs. If one
SGPR is still available, this costs only one extra mov. If none are
available, the entire computation can be done in place and then
reversed.
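
As a sketch of the worst case (no SGPR free at all), the expansion this
patch emits looks like the following MIR, taken from the
scavenge_sgpr_pei_no_sgprs test added below ($sgpr33 is the frame
pointer, $sgpr34 the scratch wave offset, and the wave64 scale shift
is 6):

  ; fold the wave offset into the FP and unscale, all on the SALU
  $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
  $sgpr33 = S_LSHR_B32 killed $sgpr33, 6, implicit-def $scc
  $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc
  ; the only VALU cost: one copy of the result into a VGPR
  $vgpr2 = COPY killed $sgpr33
  ; no free SGPR at all, so reverse the computation in place
  $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc
  $sgpr33 = S_LSHL_B32 killed $sgpr33, 6, implicit-def $scc
  $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc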

This assumes the use is a VGPR operand. That was already assumed, and
we currently only select frame indexes to VALU instructions. This
should probably be fixed at some point to handle more possible MIR.

Added:
    llvm/trunk/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=370929&r1=370928&r2=370929&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Wed Sep  4 10:12:57 2019
@@ -6098,7 +6098,7 @@ MachineInstrBuilder SIInstrInfo::getAddN
   Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
   // TODO: Users need to deal with this.
   if (!UnusedCarry.isValid())
-    report_fatal_error("failed to scavenge unused carry-out SGPR");
+    return MachineInstrBuilder();
 
   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
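
With this change, getAddNoCarry can now return a null
MachineInstrBuilder when no SGPR can be scavenged for the carry out, so
callers must check the result before adding operands. A minimal sketch
of the caller-side pattern, mirroring the SIRegisterInfo.cpp hunk below
(MachineInstrBuilder converts to a MachineInstr pointer, so an empty
one tests false):

  if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
    // An unused carry-out SGPR was scavenged; finish the VALU add.
    MIB.addImm(Offset);
    MIB.addReg(ScaledReg, RegState::Kill);
    MIB.addImm(0); // clamp bit
  } else {
    // No SGPR pair is free for the carry out; fall back to keeping the
    // whole computation on the SALU (see the hunk below).
  }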

Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=370929&r1=370928&r2=370929&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Wed Sep  4 10:12:57 2019
@@ -1273,35 +1273,67 @@ void SIRegisterInfo::eliminateFrameIndex
         if (Offset == 0) {
           // XXX - This never happens because of emergency scavenging slot at 0?
           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
-            .addImm(Log2_32(ST.getWavefrontSize()))
+            .addImm(ST.getWavefrontSizeLog2())
             .addReg(DiffReg);
         } else {
-          Register ScaledReg =
-            RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+          if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
+            Register ScaledReg =
+              RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);
 
-          // FIXME: Assusmed VGPR use.
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
-            .addImm(Log2_32(ST.getWavefrontSize()))
-            .addReg(DiffReg, RegState::Kill);
-
-          // TODO: Fold if use instruction is another add of a constant.
-          if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
-
-            // FIXME: This can fail
-            TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
-              .addImm(Offset)
-              .addReg(ScaledReg, RegState::Kill)
-              .addImm(0); // clamp bit
+            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                    ScaledReg)
+              .addImm(ST.getWavefrontSizeLog2())
+              .addReg(DiffReg, RegState::Kill);
+
+            // TODO: Fold if use instruction is another add of a constant.
+            if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+              // FIXME: This can fail
+              MIB.addImm(Offset);
+              MIB.addReg(ScaledReg, RegState::Kill);
+              MIB.addImm(0); // clamp bit
+            } else {
+              Register ConstOffsetReg =
+                RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);
+
+              // This scavenge should always succeed; it can reuse the unused carry out.
+              assert(ConstOffsetReg && "this scavenge should not be able to fail");
+
+              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+                .addImm(Offset);
+              MIB.addReg(ConstOffsetReg, RegState::Kill);
+              MIB.addReg(ScaledReg, RegState::Kill);
+              MIB.addImm(0); // clamp bit
+            }
           } else {
-            Register ConstOffsetReg =
-              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+            // We have to produce a carry out, and there isn't a free SGPR
+            // pair for it. We can keep the whole computation on the SALU to
+            // avoid clobbering an additional register at the cost of an extra
+            // mov.
 
-            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
-              .addImm(Offset);
-            TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
-              .addReg(ConstOffsetReg, RegState::Kill)
+            // We may have 1 free scratch SGPR even though a carry out is
+            // unavailable. Only one additional mov is needed.
+            Register TmpScaledReg =
+                RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+            Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+              .addReg(DiffReg, RegState::Kill)
+              .addImm(ST.getWavefrontSizeLog2());
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
               .addReg(ScaledReg, RegState::Kill)
-              .addImm(0); // clamp bit
+              .addImm(Offset);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+              .addReg(ScaledReg, RegState::Kill);
+
+            // If there were truly no free SGPRs, we need to undo everything.
+            if (!TmpScaledReg.isValid()) {
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
+                .addReg(ScaledReg, RegState::Kill)
+                .addImm(Offset);
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
+                .addReg(DiffReg, RegState::Kill)
+                .addImm(ST.getWavefrontSizeLog2());
+            }
           }
         }
 

Added: llvm/trunk/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir?rev=370929&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir Wed Sep  4 10:12:57 2019
@@ -0,0 +1,93 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
+
+# Test what happens when an SGPR is unavailable for the unused add
+# carry out when materializing the frame index.
+
+
+# There are truly no free SGPRs, so the entire frame index expansion
+# needs to be inverted to restore the original frame register.
+
+---
+name: scavenge_sgpr_pei_no_sgprs
+tracksRegLiveness: true
+
+stack:
+  - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 }
+  - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 }
+
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
+  scratchWaveOffsetReg: $sgpr34
+  frameOffsetReg:  $sgpr33
+  stackPtrOffsetReg:  $sgpr32
+
+body:             |
+  bb.0:
+    liveins: $vgpr1
+
+    ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
+    ; CHECK: liveins: $vgpr1
+    ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
+    ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
+    ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
+    ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
+    ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
+    ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
+    ; CHECK: $sgpr33 = S_LSHR_B32 killed $sgpr33, 6, implicit-def $scc
+    ; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc
+    ; CHECK: $vgpr2 = COPY killed $sgpr33
+    ; CHECK: $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc
+    ; CHECK: $sgpr33 = S_LSHL_B32 killed $sgpr33, 6, implicit-def $scc
+    ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
+    ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
+    ; CHECK: $sgpr33 = frame-setup COPY $sgpr27
+    ; CHECK: S_ENDPGM 0, implicit $vcc
+    S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
+    $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
+    S_ENDPGM 0, implicit $vcc
+...
+
+# One 32-bit SGPR is available for the intermediate scale computation,
+# so only an extra copy to VALU is necessary.
+
+---
+name: scavenge_sgpr_pei_one_sgpr
+tracksRegLiveness: true
+
+stack:
+  - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 }
+  - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 }
+
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
+  scratchWaveOffsetReg: $sgpr34
+  frameOffsetReg:  $sgpr33
+  stackPtrOffsetReg:  $sgpr32
+
+body:             |
+  bb.0:
+    liveins: $vgpr1
+
+    ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr
+    ; CHECK: liveins: $vgpr1
+    ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
+    ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
+    ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
+    ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
+    ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
+    ; CHECK: $sgpr29 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
+    ; CHECK: $sgpr29 = S_LSHR_B32 killed $sgpr29, 6, implicit-def $scc
+    ; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc
+    ; CHECK: $vgpr2 = COPY killed $sgpr29
+    ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31
+    ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
+    ; CHECK: $sgpr33 = frame-setup COPY $sgpr27
+    ; CHECK: S_ENDPGM 0, implicit $vcc
+    S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
+    $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31
+    S_ENDPGM 0, implicit $vcc
+...



