[llvm] 045be6f - AMDGPU/GlobalISel: Fold wave address into mubuf addressing modes

Matt Arsenault via llvm-commits llvm-commits@lists.llvm.org
Wed Jan 26 12:41:13 PST 2022


Author: Matt Arsenault
Date: 2022-01-26T15:25:26-05:00
New Revision: 045be6ff36dfcbf4a1146ea06ad90909f0508f9b

URL: https://github.com/llvm/llvm-project/commit/045be6ff36dfcbf4a1146ea06ad90909f0508f9b
DIFF: https://github.com/llvm/llvm-project/commit/045be6ff36dfcbf4a1146ea06ad90909f0508f9b.diff

LOG: AMDGPU/GlobalISel: Fold wave address into mubuf addressing modes

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e48dca3cc957..4883b6a86ef8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3905,20 +3905,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
   return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
 }
 
+// Return the wave level SGPR base address if this is a wave address.
+static Register getWaveAddress(const MachineInstr *Def) {
+  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
+             ? Def->getOperand(1).getReg()
+             : Register();
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
     MachineOperand &Root) const {
-  MachineInstr *MI = Root.getParent();
-  MachineBasicBlock *MBB = MI->getParent();
+  Register Reg = Root.getReg();
+  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+  const MachineInstr *Def = MRI->getVRegDef(Reg);
+  if (Register WaveBase = getWaveAddress(Def)) {
+    return {{
+        [=](MachineInstrBuilder &MIB) { // rsrc
+          MIB.addReg(Info->getScratchRSrcReg());
+        },
+        [=](MachineInstrBuilder &MIB) { // soffset
+          MIB.addReg(WaveBase);
+        },
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
+    }};
+  }
 
   int64_t Offset = 0;
+
+  // FIXME: Copy check is a hack
+  Register BasePtr;
+  if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
+    if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+      return {};
+    const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
+    Register WaveBase = getWaveAddress(BasePtrDef);
+    if (!WaveBase)
+      return {};
+
+    return {{
+        [=](MachineInstrBuilder &MIB) { // rsrc
+          MIB.addReg(Info->getScratchRSrcReg());
+        },
+        [=](MachineInstrBuilder &MIB) { // soffset
+          MIB.addReg(WaveBase);
+        },
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+    }};
+  }
+
   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
     return {};
 
-  const MachineFunction *MF = MBB->getParent();
-  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-
   return {{
       [=](MachineInstrBuilder &MIB) { // rsrc
         MIB.addReg(Info->getScratchRSrcReg());

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 0519a9c7db32..8e0f3fc47989 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -14,21 +14,20 @@ define amdgpu_kernel void @kernel_caller_stack() {
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT:    s_mov_b32 s32, 0
 ; MUBUF-NEXT:    s_add_u32 s0, s0, s7
+; MUBUF-NEXT:    s_mov_b32 s32, 0
 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
-; MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 9
-; MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 10
-; MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 11
-; MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 12
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT:    s_endpgm
 ;
@@ -112,42 +111,41 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; MUBUF-NEXT:    buffer_load_dword v14, off, s[0:3], 0 offset:64
 ; MUBUF-NEXT:    buffer_load_dword v15, off, s[0:3], 0 offset:68
 ; MUBUF-NEXT:    s_movk_i32 s32, 0x1400
-; MUBUF-NEXT:    v_lshrrev_b32_e64 v16, 6, s32
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v0, v16, s[0:3], 0 offen
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v1, v16, s[0:3], 0 offen offset:4
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v2, v16, s[0:3], 0 offen offset:8
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v3, v16, s[0:3], 0 offen offset:12
+; MUBUF-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v4, v16, s[0:3], 0 offen offset:16
+; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v5, v16, s[0:3], 0 offen offset:20
+; MUBUF-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v6, v16, s[0:3], 0 offen offset:24
+; MUBUF-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:24
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v7, v16, s[0:3], 0 offen offset:28
+; MUBUF-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:28
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v8, v16, s[0:3], 0 offen offset:32
+; MUBUF-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:32
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v9, v16, s[0:3], 0 offen offset:36
+; MUBUF-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:36
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v10, v16, s[0:3], 0 offen offset:40
+; MUBUF-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:40
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v11, v16, s[0:3], 0 offen offset:44
+; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:44
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v12, v16, s[0:3], 0 offen offset:48
+; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:48
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v13, v16, s[0:3], 0 offen offset:52
+; MUBUF-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen offset:56
+; MUBUF-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56
 ; MUBUF-NEXT:    s_waitcnt vmcnt(15)
-; MUBUF-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen offset:60
+; MUBUF-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:60
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT:    s_endpgm
 ;
@@ -244,20 +242,19 @@ define void @func_caller_stack() {
 ; MUBUF-NEXT:    v_writelane_b32 v40, s33, 2
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 9
-; MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 10
-; MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 11
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 12
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; MUBUF-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 0
 ; MUBUF-NEXT:    v_readlane_b32 s5, v40, 1
@@ -317,65 +314,64 @@ define void @func_caller_byval([16 x i32] addrspace(5)* %argptr) {
 ; MUBUF-NEXT:    v_writelane_b32 v40, s33, 2
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
 ; MUBUF-NEXT:    s_addk_i32 s32, 0x400
-; MUBUF-NEXT:    v_lshrrev_b32_e64 v3, 6, s32
 ; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
 ; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
 ; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen offset:8
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:12
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
 ; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen offset:16
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:20
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
 ; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen offset:24
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:24
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:28
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
 ; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen offset:32
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:32
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:36
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:36
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
 ; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen offset:40
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:40
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:44
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:44
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
 ; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen offset:48
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:48
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:52
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:52
 ; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
 ; MUBUF-NEXT:    s_nop 0
 ; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen offset:56
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56
 ; MUBUF-NEXT:    s_waitcnt vmcnt(1)
-; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:60
+; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60
 ; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; MUBUF-NEXT:    v_readlane_b32 s4, v40, 0
 ; MUBUF-NEXT:    v_readlane_b32 s5, v40, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index 11b7c28860ae..246c7686d646 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -581,3 +581,144 @@ body: |
     G_STORE %1, %0 :: (store (s8), align 1, addrspace 5)
 
 ...
+
+---
+name: function_store_private_s32_to_4_wave_address
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+    G_STORE %0, %1 :: (store (s32), align 4, addrspace 5)
+
+...
+
+# Has regbank copy of constant
+---
+name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+    %2:sgpr(s32) = G_CONSTANT i32 4095
+    %3:vgpr(s32) = COPY %2
+    %4:vgpr(p5) = G_PTR_ADD %1, %3
+    G_STORE %0, %4 :: (store (s32), align 4, addrspace 5)
+
+...
+
+---
+name: function_store_private_s32_to_4_wave_address_offset_4095
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX6-NEXT: %3:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], %3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_LSHRREV_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+    %2:vgpr(s32) = G_CONSTANT i32 4095
+    %3:vgpr(p5) = G_PTR_ADD %1, %2
+    G_STORE %0, %3 :: (store (s32), align 4, addrspace 5)
+
+...
+
+---
+name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: false
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+    ; GFX6: liveins: $vgpr0, $vgpr1
+    ; GFX6-NEXT: {{  $}}
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+    ; GFX6-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], %4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9-NEXT: {{  $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+    ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
+    ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+    %2:sgpr(s32) = G_CONSTANT i32 4096
+    %3:vgpr(s32) = COPY %2
+    %4:vgpr(p5) = G_PTR_ADD %1, %3
+    G_STORE %0, %4 :: (store (s32), align 4, addrspace 5)
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 6950088bd71b..3c32a431bc01 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -144,8 +144,7 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; SDAG-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
-; GISEL-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
+; GCN-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
 ; GCN-NEXT:      0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -187,15 +186,13 @@ attributes #0 = { nounwind }
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
 ; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x90{{$}}
-; SDAG-NEXT:        .vgpr_count:     0x2a{{$}}
-; GISEL-NEXT:        .vgpr_count:     0x34{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
 ; GCN-NEXT:      no_stack_indirect_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
 ; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
-; SDAG-NEXT:        .vgpr_count:     0x2a{{$}}
-; GISEL-NEXT:        .vgpr_count:     0x34{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
 ; GCN-NEXT:      simple_lds:
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
 ; GCN-NEXT:        .sgpr_count:     0x20{{$}}
@@ -227,8 +224,7 @@ attributes #0 = { nounwind }
 ; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
 ; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
-; SDAG-NEXT:        .vgpr_count:     0x2b{{$}}
-; GISEL-NEXT:        .vgpr_count:     0x34{{$}}
+; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
 ; GCN-NEXT:      simple_stack_recurse:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GCN-NEXT:        .sgpr_count:     0x26{{$}}


        


More information about the llvm-commits mailing list