[llvm] 0a488cb - [AMDGPU] use scalar shift for SALU users in frame index elimination

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 22 03:43:30 PDT 2022


Author: alex-t
Date: 2022-03-22T11:43:23+01:00
New Revision: 0a488cba2c1b207cec49c8873c7d6b2af75f2d19

URL: https://github.com/llvm/llvm-project/commit/0a488cba2c1b207cec49c8873c7d6b2af75f2d19
DIFF: https://github.com/llvm/llvm-project/commit/0a488cba2c1b207cec49c8873c7d6b2af75f2d19.diff

LOG: [AMDGPU] use scalar shift for SALU users in frame index elimination

In the frame index lowering we have to insert shift and add
instructions to adjust stack object access.  We need to take care of the stack
object user kind and use scalar shift/add for scalar users.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D121524

Added: 
    llvm/test/CodeGen/AMDGPU/frame-index.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index b8d0e0370f0fb..d6b9c9f787774 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2239,57 +2239,77 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       if (!IsMUBUF && !MFI->isEntryFunction()) {
         // Convert to a swizzled stack address by scaling by the wave size.
-        //
         // In an entry function/kernel the offset is already swizzled.
-
-        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
-        Register ResultReg =
-            IsCopy ? MI->getOperand(0).getReg()
-                   : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+        bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
+        bool LiveSCC = RS->isRegUsed(AMDGPU::SCC);
+        const TargetRegisterClass *RC = IsSALU && !LiveSCC
+                                            ? &AMDGPU::SReg_32RegClass
+                                            : &AMDGPU::VGPR_32RegClass;
+        bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+                      MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+        Register ResultReg = IsCopy ? MI->getOperand(0).getReg()
+                                    : RS->scavengeRegister(RC, MI, 0);
 
         int64_t Offset = FrameInfo.getObjectOffset(Index);
         if (Offset == 0) {
+          unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
+                                               : AMDGPU::V_LSHRREV_B32_e64;
           // XXX - This never happens because of emergency scavenging slot at 0?
-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
-            .addImm(ST.getWavefrontSizeLog2())
-            .addReg(FrameReg);
+          auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg)
+                           .addImm(ST.getWavefrontSizeLog2())
+                           .addReg(FrameReg);
+          if (IsSALU && !LiveSCC)
+            Shift.getInstr()->getOperand(3).setIsDead(
+                true); // Mark SCC as dead.
+          if (IsSALU && LiveSCC) {
+            Register NewDest =
+                RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+                    NewDest)
+                .addReg(ResultReg);
+            ResultReg = NewDest;
+          }
         } else {
-          if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
-            // Reuse ResultReg in intermediate step.
-            Register ScaledReg = ResultReg;
-
-            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
-                    ScaledReg)
-              .addImm(ST.getWavefrontSizeLog2())
-              .addReg(FrameReg);
-
-            const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
-
-            // TODO: Fold if use instruction is another add of a constant.
-            if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
-              // FIXME: This can fail
-              MIB.addImm(Offset);
-              MIB.addReg(ScaledReg, RegState::Kill);
-              if (!IsVOP2)
+          MachineInstrBuilder MIB;
+          if (!IsSALU) {
+            if (MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
+              // Reuse ResultReg in intermediate step.
+              Register ScaledReg = ResultReg;
+
+              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                      ScaledReg)
+                .addImm(ST.getWavefrontSizeLog2())
+                .addReg(FrameReg);
+
+              const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
+
+              // TODO: Fold if use instruction is another add of a constant.
+              if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+                // FIXME: This can fail
+                MIB.addImm(Offset);
+                MIB.addReg(ScaledReg, RegState::Kill);
+                if (!IsVOP2)
+                  MIB.addImm(0); // clamp bit
+              } else {
+                assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
+                       "Need to reuse carry out register");
+
+                // Use scavenged unused carry out as offset register.
+                Register ConstOffsetReg;
+                if (!isWave32)
+                  ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
+                else
+                  ConstOffsetReg = MIB.getReg(1);
+
+                BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+                    .addImm(Offset);
+                MIB.addReg(ConstOffsetReg, RegState::Kill);
+                MIB.addReg(ScaledReg, RegState::Kill);
                 MIB.addImm(0); // clamp bit
-            } else {
-              assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
-                     "Need to reuse carry out register");
-
-              // Use scavenged unused carry out as offset register.
-              Register ConstOffsetReg;
-              if (!isWave32)
-                ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
-              else
-                ConstOffsetReg = MIB.getReg(1);
-
-              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
-                .addImm(Offset);
-              MIB.addReg(ConstOffsetReg, RegState::Kill);
-              MIB.addReg(ScaledReg, RegState::Kill);
-              MIB.addImm(0); // clamp bit
+              }
             }
-          } else {
+          }
+          if (!MIB || IsSALU) {
             // We have to produce a carry out, and there isn't a free SGPR pair
             // for it. We can keep the whole computation on the SALU to avoid
             // clobbering an additional register at the cost of an extra mov.
@@ -2297,7 +2317,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
             // We may have 1 free scratch SGPR even though a carry out is
             // unavailable. Only one additional mov is needed.
             Register TmpScaledReg =
-                RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
             Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
 
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
@@ -2306,14 +2326,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
                 .addReg(ScaledReg, RegState::Kill)
                 .addImm(Offset);
-            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
-              .addReg(ScaledReg, RegState::Kill);
+            if (!IsSALU)
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+                  .addReg(ScaledReg, RegState::Kill);
+            else
+              ResultReg = ScaledReg;
 
             // If there were truly no free SGPRs, we need to undo everything.
             if (!TmpScaledReg.isValid()) {
               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
-                  .addReg(ScaledReg, RegState::Kill)
-                  .addImm(-Offset);
+                .addReg(ScaledReg, RegState::Kill)
+                .addImm(-Offset);
               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
                 .addReg(FrameReg)
                 .addImm(ST.getWavefrontSizeLog2());

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
new file mode 100644
index 0000000000000..b56bbaa1504ae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -0,0 +1,186 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: func_add_constant_to_fi_divergent_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 8, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+
+body:             |
+  bb.0:
+    liveins: $vgpr31, $sgpr30_sgpr31
+
+    ; GCN-LABEL: name: func_add_constant_to_fi_divergent_i32
+    ; GCN: liveins: $vgpr31, $sgpr30_sgpr31
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: renamable $vgpr0 = V_AND_B32_e32 1023, killed $vgpr31, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
+    ; GCN-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr1, killed $vgpr0, implicit-def dead $vcc, implicit $exec
+    ; GCN-NEXT: renamable $vgpr0, dead renamable $vcc = nuw V_ADD_CO_U32_e64 4, killed $vgpr0, 0, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 -1
+    ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+    renamable $vgpr0 = V_AND_B32_e32 1023, killed $vgpr31, implicit $exec
+    renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec
+    renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
+    renamable $vgpr0, dead renamable $vcc = nuw V_ADD_CO_U32_e64 4, killed $vgpr0, 0, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
+---
+name: func_add_constant_to_fi_uniform_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 8, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr30_sgpr31
+
+    ; GCN-LABEL: name: func_add_constant_to_fi_uniform_i32
+    ; GCN: liveins: $sgpr30_sgpr31
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vcc_hi = S_LSHR_B32 6, $sgpr32, implicit-def dead $scc
+    ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_I32 killed $vcc_hi, 4, implicit-def dead $scc
+    ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 -1
+    ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+    renamable $sgpr4 = nuw S_ADD_I32 %stack.0, 4, implicit-def dead $scc
+    renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
+---
+name: func_add_constant_to_fi_uniform_SCC_clobber_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 8, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, type: default, offset: 0, size: 32, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 512, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+
+machineFunctionInfo:
+
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr30_sgpr31
+
+    ; GCN-LABEL: name: func_add_constant_to_fi_uniform_SCC_clobber_i32
+    ; GCN: liveins: $sgpr30_sgpr31
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: $vcc_hi = S_LSHR_B32 6, $sgpr32, implicit-def dead $scc
+    ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 killed $vcc_hi, 4, implicit-def $scc
+    ; GCN-NEXT: renamable $sgpr5 = S_ADDC_U32 $sgpr4, 1234567, implicit-def $scc, implicit $scc
+    ; GCN-NEXT: $vcc_hi = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GCN-NEXT: $vcc_hi = S_ADD_I32 killed $vcc_hi, 8, implicit-def $scc
+    ; GCN-NEXT: renamable $sgpr6 = S_MUL_I32 killed $vcc_hi, $sgpr5
+    ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr6, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 -1
+    ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+    renamable $sgpr4 = nuw S_ADD_U32 %stack.0, 4, implicit-def $scc
+    renamable $sgpr5 = S_ADDC_U32 $sgpr4, 1234567, implicit-def $scc, implicit $scc
+    renamable $sgpr6 = S_MUL_I32 %stack.1, $sgpr5
+    renamable $vgpr0 = COPY killed renamable $sgpr6, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
+---
+name: func_other_fi_user_non_inline_imm_offset_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 512, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, type: default, offset: 0, size: 32, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 512, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr30_sgpr31
+
+    ; GCN-LABEL: name: func_other_fi_user_non_inline_imm_offset_i32
+    ; GCN: liveins: $sgpr30_sgpr31
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 260, 0, 0, 0, implicit $exec
+    ; GCN-NEXT: $vcc_hi = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+    ; GCN-NEXT: $vcc_hi = S_ADD_I32 killed $vcc_hi, 512, implicit-def $scc
+    ; GCN-NEXT: renamable $sgpr4 = S_MUL_I32 killed $vcc_hi, 9
+    ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 -1
+    ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+    renamable $vgpr0 = V_MOV_B32_e32 7, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, implicit $exec
+    renamable $sgpr4 = S_MUL_I32 %stack.1, 9
+    renamable $vgpr0 = COPY killed renamable $sgpr4, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...
+---
+name: func_add_constant_to_fi_uniform_live_SCC_i32
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 8, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+
+machineFunctionInfo:
+
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    liveins: $sgpr30_sgpr31, $sgpr10
+    ; GCN-LABEL: name: func_add_constant_to_fi_uniform_live_SCC_i32
+    ; GCN: liveins: $sgpr30_sgpr31, $sgpr10
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 $sgpr10, 4, implicit-def $scc
+    ; GCN-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GCN-NEXT: $vcc_hi = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; GCN-NEXT: renamable $sgpr5 = S_MUL_I32 killed $vcc_hi, $sgpr4
+    ; GCN-NEXT: renamable $sgpr6 = S_ADDC_U32 $sgpr5, 1234567, implicit-def dead $scc, implicit $scc
+    ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr6, implicit $exec
+    ; GCN-NEXT: $m0 = S_MOV_B32 -1
+    ; GCN-NEXT: DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    ; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+    renamable $sgpr4 = nuw S_ADD_U32 $sgpr10, 4, implicit-def $scc
+    renamable $sgpr5 = S_MUL_I32 %stack.0, $sgpr4
+    renamable $sgpr6 = S_ADDC_U32 $sgpr5, 1234567, implicit-def dead $scc, implicit $scc
+    renamable $vgpr0 = COPY killed renamable $sgpr6, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_WRITE_B32 undef renamable $vgpr0, killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_SETPC_B64_return killed renamable $sgpr30_sgpr31
+
+...


        


More information about the llvm-commits mailing list