[llvm] 045be6f - AMDGPU/GlobalISel: Fold wave address into mubuf addressing modes
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 26 12:41:13 PST 2022
Author: Matt Arsenault
Date: 2022-01-26T15:25:26-05:00
New Revision: 045be6ff36dfcbf4a1146ea06ad90909f0508f9b
URL: https://github.com/llvm/llvm-project/commit/045be6ff36dfcbf4a1146ea06ad90909f0508f9b
DIFF: https://github.com/llvm/llvm-project/commit/045be6ff36dfcbf4a1146ea06ad90909f0508f9b.diff
LOG: AMDGPU/GlobalISel: Fold wave address into mubuf addressing modes
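
For illustration, a minimal sketch of the pattern this fold enables, mirroring
the MIR test added below (the 4095 offset and virtual register numbers are
taken from that test; the selected line is abbreviated): a private store whose
address is a G_AMDGPU_WAVE_ADDRESS of the stack pointer, optionally offset by
a constant that fits the MUBUF immediate offset via G_PTR_ADD, now selects to
the offset form of the buffer store with the wave-level SGPR base folded into
the soffset operand, instead of shifting s32 into a VGPR and using the offen
form.

  %0:vgpr(s32) = COPY $vgpr0
  %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
  %2:sgpr(s32) = G_CONSTANT i32 4095
  %3:vgpr(s32) = COPY %2
  %4:vgpr(p5) = G_PTR_ADD %1, %3
  G_STORE %0, %4 :: (store (s32), addrspace 5)

  ; Selected result (sketch): $sgpr32 lands in soffset and the constant in the
  ; immediate offset field.
  BUFFER_STORE_DWORD_OFFSET %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
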
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e48dca3cc957..4883b6a86ef8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3905,20 +3905,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
}
+// Return the wave level SGPR base address if this is a wave address.
+static Register getWaveAddress(const MachineInstr *Def) {
+ return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
+ ? Def->getOperand(1).getReg()
+ : Register();
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
MachineOperand &Root) const {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
+ Register Reg = Root.getReg();
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ const MachineInstr *Def = MRI->getVRegDef(Reg);
+ if (Register WaveBase = getWaveAddress(Def)) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(WaveBase);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
+ }};
+ }
int64_t Offset = 0;
+
+ // FIXME: Copy check is a hack
+ Register BasePtr;
+ if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
+ if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+ return {};
+ const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
+ Register WaveBase = getWaveAddress(BasePtrDef);
+ if (!WaveBase)
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(WaveBase);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+ }};
+ }
+
if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
return {};
- const MachineFunction *MF = MBB->getParent();
- const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-
return {{
[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 0519a9c7db32..8e0f3fc47989 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -14,21 +14,20 @@ define amdgpu_kernel void @kernel_caller_stack() {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; MUBUF-NEXT: s_mov_b32 s32, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_mov_b32 s32, 0
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
-; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; MUBUF-NEXT: v_mov_b32_e32 v1, 9
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; MUBUF-NEXT: v_mov_b32_e32 v1, 10
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; MUBUF-NEXT: v_mov_b32_e32 v1, 11
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; MUBUF-NEXT: v_mov_b32_e32 v1, 12
+; MUBUF-NEXT: v_mov_b32_e32 v0, 9
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v0, 10
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; MUBUF-NEXT: v_mov_b32_e32 v0, 11
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; MUBUF-NEXT: v_mov_b32_e32 v0, 12
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: s_endpgm
;
@@ -112,42 +111,41 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
-; MUBUF-NEXT: v_lshrrev_b32_e64 v16, 6, s32
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v0, v16, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v1, v16, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v2, v16, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v3, v16, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v6, v16, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v7, v16, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v8, v16, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v9, v16, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56
; MUBUF-NEXT: s_waitcnt vmcnt(15)
-; MUBUF-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: s_endpgm
;
@@ -244,20 +242,19 @@ define void @func_caller_stack() {
; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
; MUBUF-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; MUBUF-NEXT: v_mov_b32_e32 v1, 9
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; MUBUF-NEXT: v_mov_b32_e32 v1, 10
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; MUBUF-NEXT: v_mov_b32_e32 v1, 11
+; MUBUF-NEXT: v_mov_b32_e32 v0, 9
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v0, 10
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; MUBUF-NEXT: v_mov_b32_e32 v0, 11
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; MUBUF-NEXT: v_mov_b32_e32 v1, 12
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; MUBUF-NEXT: v_mov_b32_e32 v0, 12
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s4, v40, 0
; MUBUF-NEXT: v_readlane_b32 s5, v40, 1
@@ -317,65 +314,64 @@ define void @func_caller_byval([16 x i32] addrspace(5)* %argptr) {
; MUBUF-NEXT: v_writelane_b32 v40, s33, 2
; MUBUF-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT: s_addk_i32 s32, 0x400
-; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s32
; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:40
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:48
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56
; MUBUF-NEXT: s_waitcnt vmcnt(1)
-; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
; MUBUF-NEXT: v_readlane_b32 s4, v40, 0
; MUBUF-NEXT: v_readlane_b32 s5, v40, 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index 11b7c28860ae..246c7686d646 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -581,3 +581,144 @@ body: |
G_STORE %1, %0 :: (store (s8), align 1, addrspace 5)
...
+
+---
+name: function_store_private_s32_to_4_wave_address
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address
+ ; GFX6: liveins: $vgpr0, $vgpr1
+ ; GFX6-NEXT: {{ $}}
+ ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+ ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ G_STORE %0, %1 :: (store (s32), align 4, addrspace 5)
+
+...
+
+# Has regbank copy of constant
+---
+name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+ ; GFX6: liveins: $vgpr0, $vgpr1
+ ; GFX6-NEXT: {{ $}}
+ ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+ ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4095
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ %2:sgpr(s32) = G_CONSTANT i32 4095
+ %3:vgpr(s32) = COPY %2
+ %4:vgpr(p5) = G_PTR_ADD %1, %3
+ G_STORE %0, %4 :: (store (s32), align 4, addrspace 5)
+
+...
+
+---
+name: function_store_private_s32_to_4_wave_address_offset_4095
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
+ ; GFX6: liveins: $vgpr0, $vgpr1
+ ; GFX6-NEXT: {{ $}}
+ ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+ ; GFX6-NEXT: %3:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], %3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+ ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_4095
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_LSHRREV_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ %2:vgpr(s32) = G_CONSTANT i32 4095
+ %3:vgpr(p5) = G_PTR_ADD %1, %2
+ G_STORE %0, %3 :: (store (s32), align 4, addrspace 5)
+
+...
+
+---
+name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: false
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX6-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+ ; GFX6: liveins: $vgpr0, $vgpr1
+ ; GFX6-NEXT: {{ $}}
+ ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX6-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX6-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
+ ; GFX6-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], %4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+ ; GFX9-LABEL: name: function_store_private_s32_to_4_wave_address_offset_copy_constant_4096
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHRREV_B32_e64_]], [[COPY1]], 0, implicit $exec
+ ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 5)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:vgpr(p5) = G_AMDGPU_WAVE_ADDRESS $sgpr32
+ %2:sgpr(s32) = G_CONSTANT i32 4096
+ %3:vgpr(s32) = COPY %2
+ %4:vgpr(p5) = G_PTR_ADD %1, %3
+ G_STORE %0, %4 :: (store (s32), align 4, addrspace 5)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 6950088bd71b..3c32a431bc01 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -144,8 +144,7 @@ attributes #0 = { nounwind }
; GCN: amdpal.pipelines:
; GCN-NEXT: - .registers:
-; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
-; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
+; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
; GCN-NEXT: .shader_functions:
; GCN-NEXT: dynamic_stack:
@@ -187,15 +186,13 @@ attributes #0 = { nounwind }
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; GCN-NEXT: .vgpr_count: 0x2a{{$}}
; GCN-NEXT: no_stack_indirect_call:
; GCN-NEXT: .lds_size: 0{{$}}
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; GCN-NEXT: .vgpr_count: 0x2a{{$}}
; GCN-NEXT: simple_lds:
; GCN-NEXT: .lds_size: 0x100{{$}}
; GCN-NEXT: .sgpr_count: 0x20{{$}}
@@ -227,8 +224,7 @@ attributes #0 = { nounwind }
; GFX8-NEXT: .sgpr_count: 0x28{{$}}
; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2b{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; GCN-NEXT: .vgpr_count: 0x2b{{$}}
; GCN-NEXT: simple_stack_recurse:
; GCN-NEXT: .lds_size: 0{{$}}
; GCN-NEXT: .sgpr_count: 0x26{{$}}