[llvm] [AMDGPU] Support alloca in AS0 (PR #136584)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 22 18:49:18 PDT 2025
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/136584
>From ae093995b232a38ce4487e5009fd76a3d1dd9fec Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 22 Apr 2025 20:36:20 -0400
Subject: [PATCH] [AMDGPU] Support alloca in AS0
This PR lowers an alloca in AS0 to an alloca in AS5 followed by an addrspacecast
back to AS0.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 33 +-
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 5 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 29 +-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/test/CodeGen/AMDGPU/alloca-as0.ll | 417 ++++++++++++++++++
.../AMDGPU/assert-wrong-alloca-addrspace.ll | 16 -
7 files changed, 485 insertions(+), 19 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/alloca-as0.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2846405a2538c..ddc61a219eb83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -385,9 +385,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
+ setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
+
// For R600, this is totally unsupported, just custom lower to produce an
// error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
// Library functions. These default to Expand, but we have instructions
// for them.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index beb6432170970..4d7002db2cca7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -912,12 +912,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16);
- getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
+ getActionDefinitionsBuilder(G_FRAME_INDEX)
+ .legalFor({PrivatePtr})
+ .customFor({FlatPtr});
// If the amount is divergent, we have to do a wave reduction to get the
// maximum value, so this is expanded during RegBankSelect.
getActionDefinitionsBuilder(G_DYN_STACKALLOC)
- .legalFor({{PrivatePtr, S32}});
+ .legalFor({{PrivatePtr, S32}})
+ .customFor({FlatPtr, S32});
getActionDefinitionsBuilder(G_STACKSAVE)
.customFor({PrivatePtr});
@@ -2221,6 +2224,10 @@ bool AMDGPULegalizerInfo::legalizeCustom(
return legalizeTrap(MI, MRI, B);
case TargetOpcode::G_DEBUGTRAP:
return legalizeDebugTrap(MI, MRI, B);
+ case TargetOpcode::G_FRAME_INDEX:
+ return legalizeFrameIndex(MI, MRI, B);
+ case TargetOpcode::G_DYN_STACKALLOC:
+ return legalizeDynStackAlloc(MI, MRI, B);
default:
return false;
}
@@ -7668,3 +7675,25 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return true;
}
+
+bool AMDGPULegalizerInfo::legalizeFrameIndex(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ MachineInstrBuilder FI = B.buildFrameIndex(
+ LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), MI.getOperand(1).getIndex());
+ B.buildAddrSpaceCast(MI.getOperand(0).getReg(), FI);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeDynStackAlloc(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ MachineInstrBuilder Size = B.buildTrunc(S32, MI.getOperand(1));
+ Align Alignment(MI.getOperand(2).getImm());
+ MachineInstrBuilder DynStackAlloc = B.buildDynStackAlloc(
+ LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), Size, Alignment);
+ B.buildAddrSpaceCast(MI.getOperand(0).getReg(), DynStackAlloc);
+ MI.eraseFromParent();
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1f4e02b0d600a..55250530689cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -246,6 +246,11 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const override;
+
+ bool legalizeFrameIndex(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeDynStackAlloc(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
};
} // End llvm namespace.
#endif
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2e3cd5ca6692d..3f2e5fbce03a1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4117,6 +4117,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
InVals, /*IsThisReturn=*/false, SDValue());
}
+SDValue SITargetLowering::lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+ // Since address space information is lost here, we assume that an i64 frame
+ // index comes from an alloca in AS0.
+ SDLoc DL(Op);
+ auto *FI = cast<FrameIndexSDNode>(Op);
+ SDValue TFI = DAG.getFrameIndex(FI->getIndex(), MVT::i32);
+ return DAG.getAddrSpaceCast(DL, Op.getValueType(), TFI,
+ AMDGPUAS::PRIVATE_ADDRESS,
+ AMDGPUAS::FLAT_ADDRESS);
+}
+
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for:
// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
@@ -4129,13 +4140,27 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDLoc dl(Op);
EVT VT = Op.getValueType();
SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+
+ // Since address space information is lost here, we assume that an i64 dynamic
+ // alloca comes from an alloca in AS0.
+ if (VT == MVT::i64) {
+ SDValue Align = Op.getOperand(2);
+ Size = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
+ SDValue Ops[] = {Chain, Size, Align};
+ SDValue DynAlloc =
+ DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, {MVT::i32, MVT::Other}, Ops);
+ SDValue Cast = DAG.getAddrSpaceCast(
+ dl, VT, DynAlloc, AMDGPUAS::PRIVATE_ADDRESS, AMDGPUAS::FLAT_ADDRESS);
+ return DAG.getMergeValues({Cast, DynAlloc.getValue(1)}, dl);
+ }
+
Register SPReg = Info->getStackPtrOffsetReg();
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
- SDValue Size = Op.getOperand(1);
SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
@@ -6087,6 +6112,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
return lowerXMUL_LOHI(Op, DAG);
+ case ISD::FrameIndex:
+ return lowerFrameIndex(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::STACKSAVE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..f08cd15282c94 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -428,6 +428,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/alloca-as0.ll b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll
new file mode 100644
index 0000000000000..5f20f7ce1e638
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll
@@ -0,0 +1,417 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 %s -o - | FileCheck %s --check-prefix=ISEL
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -O0 %s -o - | FileCheck %s --check-prefix=GI
+
+declare void @bar(ptr)
+
+define i32 @static_alloca() {
+; ISEL-LABEL: static_alloca:
+; ISEL: ; %bb.0:
+; ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISEL-NEXT: s_mov_b32 s16, s33
+; ISEL-NEXT: s_mov_b32 s33, s32
+; ISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
+; ISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; ISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; ISEL-NEXT: s_mov_b64 exec, s[18:19]
+; ISEL-NEXT: v_writelane_b32 v40, s16, 2
+; ISEL-NEXT: s_add_i32 s32, s32, 0x400
+; ISEL-NEXT: v_writelane_b32 v40, s30, 0
+; ISEL-NEXT: v_writelane_b32 v40, s31, 1
+; ISEL-NEXT: s_mov_b32 s18, 32
+; ISEL-NEXT: s_mov_b64 s[16:17], src_private_base
+; ISEL-NEXT: s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT: s_mov_b32 s17, s16
+; ISEL-NEXT: s_mov_b64 s[22:23], 0
+; ISEL-NEXT: s_mov_b32 s19, s23
+; ISEL-NEXT: s_mov_b32 s20, -1
+; ISEL-NEXT: s_lshr_b32 s16, s33, 6
+; ISEL-NEXT: s_cmp_lg_u32 s16, s20
+; ISEL-NEXT: s_cselect_b32 s20, s17, s19
+; ISEL-NEXT: s_mov_b32 s17, s22
+; ISEL-NEXT: s_cselect_b32 s19, s16, s17
+; ISEL-NEXT: s_mov_b32 s16, s19
+; ISEL-NEXT: s_mov_b32 s17, s20
+; ISEL-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; ISEL-NEXT: v_writelane_b32 v41, s16, 0
+; ISEL-NEXT: v_writelane_b32 v41, s17, 1
+; ISEL-NEXT: s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT: s_mov_b32 s18, s16
+; ISEL-NEXT: s_getpc_b64 s[16:17]
+; ISEL-NEXT: s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; ISEL-NEXT: s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; ISEL-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; ISEL-NEXT: s_mov_b64 s[22:23], s[2:3]
+; ISEL-NEXT: s_mov_b64 s[20:21], s[0:1]
+; ISEL-NEXT: s_mov_b64 s[0:1], s[20:21]
+; ISEL-NEXT: s_mov_b64 s[2:3], s[22:23]
+; ISEL-NEXT: v_mov_b32_e32 v0, s19
+; ISEL-NEXT: v_mov_b32_e32 v1, s18
+; ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; ISEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; ISEL-NEXT: v_readlane_b32 s4, v41, 0
+; ISEL-NEXT: v_readlane_b32 s5, v41, 1
+; ISEL-NEXT: v_mov_b32_e32 v0, s4
+; ISEL-NEXT: v_mov_b32_e32 v1, s5
+; ISEL-NEXT: flat_load_dword v0, v[0:1]
+; ISEL-NEXT: v_readlane_b32 s31, v40, 1
+; ISEL-NEXT: v_readlane_b32 s30, v40, 0
+; ISEL-NEXT: s_mov_b32 s32, s33
+; ISEL-NEXT: v_readlane_b32 s4, v40, 2
+; ISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
+; ISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; ISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; ISEL-NEXT: s_mov_b64 exec, s[6:7]
+; ISEL-NEXT: s_mov_b32 s33, s4
+; ISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; ISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: static_alloca:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: s_mov_b32 s16, s33
+; GI-NEXT: s_mov_b32 s33, s32
+; GI-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GI-NEXT: s_mov_b64 exec, s[18:19]
+; GI-NEXT: v_writelane_b32 v40, s16, 2
+; GI-NEXT: s_add_i32 s32, s32, 0x400
+; GI-NEXT: v_writelane_b32 v40, s30, 0
+; GI-NEXT: v_writelane_b32 v40, s31, 1
+; GI-NEXT: s_lshr_b32 s17, s33, 6
+; GI-NEXT: s_mov_b64 s[18:19], src_private_base
+; GI-NEXT: ; kill: def $sgpr16 killed $sgpr18
+; GI-NEXT: s_mov_b32 s16, s19
+; GI-NEXT: s_mov_b32 s18, s17
+; GI-NEXT: s_mov_b32 s19, s16
+; GI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; GI-NEXT: v_writelane_b32 v41, s18, 0
+; GI-NEXT: v_writelane_b32 v41, s19, 1
+; GI-NEXT: v_mov_b32_e32 v0, s17
+; GI-NEXT: v_mov_b32_e32 v1, s16
+; GI-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GI-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GI-NEXT: s_mov_b64 s[0:1], s[16:17]
+; GI-NEXT: s_mov_b64 s[2:3], s[18:19]
+; GI-NEXT: s_getpc_b64 s[16:17]
+; GI-NEXT: s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; GI-NEXT: s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; GI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GI-NEXT: s_waitcnt lgkmcnt(0)
+; GI-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GI-NEXT: v_readlane_b32 s4, v41, 0
+; GI-NEXT: v_readlane_b32 s5, v41, 1
+; GI-NEXT: v_mov_b32_e32 v0, s4
+; GI-NEXT: v_mov_b32_e32 v1, s5
+; GI-NEXT: flat_load_dword v0, v[0:1]
+; GI-NEXT: v_readlane_b32 s31, v40, 1
+; GI-NEXT: v_readlane_b32 s30, v40, 0
+; GI-NEXT: s_mov_b32 s32, s33
+; GI-NEXT: v_readlane_b32 s4, v40, 2
+; GI-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GI-NEXT: s_mov_b64 exec, s[6:7]
+; GI-NEXT: s_mov_b32 s33, s4
+; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT: s_setpc_b64 s[30:31]
+ %alloca = alloca i32, align 4
+ call void @bar(ptr %alloca)
+ %load = load i32, ptr %alloca
+ ret i32 %load
+}
+
+define i32 @dynamic_alloca(i32 %n) {
+; ISEL-LABEL: dynamic_alloca:
+; ISEL: ; %bb.0:
+; ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISEL-NEXT: s_mov_b32 s16, s33
+; ISEL-NEXT: s_mov_b32 s33, s32
+; ISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
+; ISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; ISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; ISEL-NEXT: s_mov_b64 exec, s[18:19]
+; ISEL-NEXT: v_writelane_b32 v40, s16, 4
+; ISEL-NEXT: v_writelane_b32 v40, s34, 2
+; ISEL-NEXT: v_writelane_b32 v40, s35, 3
+; ISEL-NEXT: s_add_i32 s32, s32, 0x800
+; ISEL-NEXT: v_writelane_b32 v40, s30, 0
+; ISEL-NEXT: v_writelane_b32 v40, s31, 1
+; ISEL-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; ISEL-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; ISEL-NEXT: v_writelane_b32 v41, s15, 0
+; ISEL-NEXT: v_writelane_b32 v41, s14, 1
+; ISEL-NEXT: v_writelane_b32 v41, s13, 2
+; ISEL-NEXT: v_writelane_b32 v41, s12, 3
+; ISEL-NEXT: v_writelane_b32 v41, s10, 4
+; ISEL-NEXT: v_writelane_b32 v41, s11, 5
+; ISEL-NEXT: v_writelane_b32 v41, s8, 6
+; ISEL-NEXT: v_writelane_b32 v41, s9, 7
+; ISEL-NEXT: v_writelane_b32 v41, s6, 8
+; ISEL-NEXT: v_writelane_b32 v41, s7, 9
+; ISEL-NEXT: v_writelane_b32 v41, s4, 10
+; ISEL-NEXT: v_writelane_b32 v41, s5, 11
+; ISEL-NEXT: s_mov_b32 s5, 15
+; ISEL-NEXT: s_mov_b32 s4, 2
+; ISEL-NEXT: v_mov_b32_e32 v1, s5
+; ISEL-NEXT: v_lshl_add_u32 v0, v0, s4, v1
+; ISEL-NEXT: s_mov_b32 s4, -16
+; ISEL-NEXT: v_and_b32_e64 v0, v0, s4
+; ISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; ISEL-NEXT: v_mov_b32_e32 v0, 0
+; ISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; ISEL-NEXT: s_mov_b64 s[4:5], exec
+; ISEL-NEXT: s_mov_b32 s6, 0
+; ISEL-NEXT: v_writelane_b32 v41, s6, 12
+; ISEL-NEXT: v_writelane_b32 v41, s4, 13
+; ISEL-NEXT: v_writelane_b32 v41, s5, 14
+; ISEL-NEXT: s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; ISEL-NEXT: s_mov_b64 exec, s[34:35]
+; ISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; ISEL-NEXT: s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; ISEL-NEXT: s_mov_b64 exec, s[34:35]
+; ISEL-NEXT: s_waitcnt vmcnt(0)
+; ISEL-NEXT: v_readlane_b32 s4, v41, 13
+; ISEL-NEXT: v_readlane_b32 s5, v41, 14
+; ISEL-NEXT: v_readlane_b32 s6, v41, 12
+; ISEL-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; ISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; ISEL-NEXT: s_waitcnt vmcnt(0)
+; ISEL-NEXT: v_readlane_b32 s8, v0, s7
+; ISEL-NEXT: s_max_u32 s6, s6, s8
+; ISEL-NEXT: v_writelane_b32 v41, s6, 15
+; ISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; ISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; ISEL-NEXT: v_writelane_b32 v41, s6, 12
+; ISEL-NEXT: v_writelane_b32 v41, s4, 13
+; ISEL-NEXT: v_writelane_b32 v41, s5, 14
+; ISEL-NEXT: s_mov_b64 s[34:35], exec
+; ISEL-NEXT: s_mov_b64 exec, -1
+; ISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; ISEL-NEXT: s_mov_b64 exec, s[34:35]
+; ISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; ISEL-NEXT: ; %bb.2:
+; ISEL-NEXT: s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; ISEL-NEXT: s_mov_b64 exec, s[34:35]
+; ISEL-NEXT: s_waitcnt vmcnt(0)
+; ISEL-NEXT: v_readlane_b32 s15, v41, 0
+; ISEL-NEXT: v_readlane_b32 s14, v41, 1
+; ISEL-NEXT: v_readlane_b32 s13, v41, 2
+; ISEL-NEXT: v_readlane_b32 s12, v41, 3
+; ISEL-NEXT: v_readlane_b32 s10, v41, 4
+; ISEL-NEXT: v_readlane_b32 s11, v41, 5
+; ISEL-NEXT: v_readlane_b32 s8, v41, 6
+; ISEL-NEXT: v_readlane_b32 s9, v41, 7
+; ISEL-NEXT: v_readlane_b32 s6, v41, 8
+; ISEL-NEXT: v_readlane_b32 s7, v41, 9
+; ISEL-NEXT: v_readlane_b32 s4, v41, 10
+; ISEL-NEXT: v_readlane_b32 s5, v41, 11
+; ISEL-NEXT: v_readlane_b32 s16, v41, 15
+; ISEL-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; ISEL-NEXT: s_mov_b32 s19, s32
+; ISEL-NEXT: s_mov_b32 s17, 6
+; ISEL-NEXT: v_mov_b32_e32 v0, s17
+; ISEL-NEXT: v_mov_b32_e32 v1, s19
+; ISEL-NEXT: v_lshl_add_u32 v0, s16, v0, v1
+; ISEL-NEXT: v_readfirstlane_b32 s20, v0
+; ISEL-NEXT: s_mov_b64 s[16:17], src_private_base
+; ISEL-NEXT: s_mov_b32 s18, 32
+; ISEL-NEXT: s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; ISEL-NEXT: s_mov_b64 s[22:23], 0
+; ISEL-NEXT: s_mov_b32 s17, s23
+; ISEL-NEXT: s_mov_b32 s21, -1
+; ISEL-NEXT: s_cmp_lg_u32 s19, s21
+; ISEL-NEXT: s_cselect_b32 s21, s16, s17
+; ISEL-NEXT: ; implicit-def: $sgpr16
+; ISEL-NEXT: ; implicit-def: $sgpr17
+; ISEL-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
+; ISEL-NEXT: s_mov_b32 s17, s21
+; ISEL-NEXT: s_mov_b32 s21, s22
+; ISEL-NEXT: s_cselect_b32 s19, s19, s21
+; ISEL-NEXT: s_mov_b32 s32, s20
+; ISEL-NEXT: s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT: s_mov_b32 s18, s16
+; ISEL-NEXT: s_getpc_b64 s[16:17]
+; ISEL-NEXT: s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; ISEL-NEXT: s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; ISEL-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; ISEL-NEXT: s_mov_b64 s[22:23], s[2:3]
+; ISEL-NEXT: s_mov_b64 s[20:21], s[0:1]
+; ISEL-NEXT: s_mov_b64 s[0:1], s[20:21]
+; ISEL-NEXT: s_mov_b64 s[2:3], s[22:23]
+; ISEL-NEXT: v_mov_b32_e32 v0, s19
+; ISEL-NEXT: v_mov_b32_e32 v1, s18
+; ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; ISEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; ISEL-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; ISEL-NEXT: v_readlane_b32 s31, v40, 1
+; ISEL-NEXT: v_readlane_b32 s30, v40, 0
+; ISEL-NEXT: s_mov_b32 s32, s33
+; ISEL-NEXT: v_readlane_b32 s4, v40, 4
+; ISEL-NEXT: v_readlane_b32 s34, v40, 2
+; ISEL-NEXT: v_readlane_b32 s35, v40, 3
+; ISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
+; ISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; ISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; ISEL-NEXT: s_mov_b64 exec, s[6:7]
+; ISEL-NEXT: s_mov_b32 s33, s4
+; ISEL-NEXT: s_waitcnt vmcnt(0)
+; ISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: dynamic_alloca:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: s_mov_b32 s16, s33
+; GI-NEXT: s_mov_b32 s33, s32
+; GI-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GI-NEXT: s_mov_b64 exec, s[18:19]
+; GI-NEXT: v_writelane_b32 v40, s16, 4
+; GI-NEXT: v_writelane_b32 v40, s34, 2
+; GI-NEXT: v_writelane_b32 v40, s35, 3
+; GI-NEXT: s_add_i32 s32, s32, 0x800
+; GI-NEXT: v_writelane_b32 v40, s30, 0
+; GI-NEXT: v_writelane_b32 v40, s31, 1
+; GI-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; GI-NEXT: v_writelane_b32 v41, s15, 0
+; GI-NEXT: v_writelane_b32 v41, s14, 1
+; GI-NEXT: v_writelane_b32 v41, s13, 2
+; GI-NEXT: v_writelane_b32 v41, s12, 3
+; GI-NEXT: v_writelane_b32 v41, s10, 4
+; GI-NEXT: v_writelane_b32 v41, s11, 5
+; GI-NEXT: v_writelane_b32 v41, s8, 6
+; GI-NEXT: v_writelane_b32 v41, s9, 7
+; GI-NEXT: v_writelane_b32 v41, s6, 8
+; GI-NEXT: v_writelane_b32 v41, s7, 9
+; GI-NEXT: v_writelane_b32 v41, s4, 10
+; GI-NEXT: v_writelane_b32 v41, s5, 11
+; GI-NEXT: v_mov_b32_e32 v1, v0
+; GI-NEXT: v_mov_b32_e32 v0, 0
+; GI-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GI-NEXT: v_mov_b32_e32 v2, v0
+; GI-NEXT: s_mov_b32 s4, 2
+; GI-NEXT: v_mov_b32_e32 v0, s4
+; GI-NEXT: v_lshlrev_b64 v[1:2], v0, v[1:2]
+; GI-NEXT: v_mov_b32_e32 v0, v1
+; GI-NEXT: v_mov_b32_e32 v1, v2
+; GI-NEXT: s_mov_b32 s4, 15
+; GI-NEXT: s_mov_b32 s6, 0
+; GI-NEXT: v_mov_b32_e32 v2, s4
+; GI-NEXT: v_add_co_u32_e64 v0, s[4:5], v0, v2
+; GI-NEXT: v_mov_b32_e32 v2, s6
+; GI-NEXT: v_addc_co_u32_e64 v2, s[4:5], v1, v2, s[4:5]
+; GI-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GI-NEXT: v_mov_b32_e32 v1, v2
+; GI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GI-NEXT: s_mov_b32 s4, -16
+; GI-NEXT: v_mov_b32_e32 v1, s4
+; GI-NEXT: v_and_b32_e64 v0, v0, v1
+; GI-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GI-NEXT: s_mov_b64 s[4:5], exec
+; GI-NEXT: s_mov_b32 s6, 0
+; GI-NEXT: v_writelane_b32 v41, s6, 12
+; GI-NEXT: v_writelane_b32 v41, s4, 13
+; GI-NEXT: v_writelane_b32 v41, s5, 14
+; GI-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GI-NEXT: s_mov_b64 exec, s[34:35]
+; GI-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GI-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GI-NEXT: s_mov_b64 exec, s[34:35]
+; GI-NEXT: s_waitcnt vmcnt(0)
+; GI-NEXT: v_readlane_b32 s4, v41, 13
+; GI-NEXT: v_readlane_b32 s5, v41, 14
+; GI-NEXT: v_readlane_b32 s6, v41, 12
+; GI-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GI-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GI-NEXT: s_waitcnt vmcnt(0)
+; GI-NEXT: v_readlane_b32 s8, v0, s7
+; GI-NEXT: s_max_u32 s6, s6, s8
+; GI-NEXT: v_writelane_b32 v41, s6, 15
+; GI-NEXT: s_bitset0_b64 s[4:5], s7
+; GI-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GI-NEXT: v_writelane_b32 v41, s6, 12
+; GI-NEXT: v_writelane_b32 v41, s4, 13
+; GI-NEXT: v_writelane_b32 v41, s5, 14
+; GI-NEXT: s_mov_b64 s[34:35], exec
+; GI-NEXT: s_mov_b64 exec, -1
+; GI-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GI-NEXT: s_mov_b64 exec, s[34:35]
+; GI-NEXT: s_cbranch_scc1 .LBB1_1
+; GI-NEXT: ; %bb.2:
+; GI-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GI-NEXT: s_mov_b64 exec, s[34:35]
+; GI-NEXT: s_waitcnt vmcnt(0)
+; GI-NEXT: v_readlane_b32 s15, v41, 0
+; GI-NEXT: v_readlane_b32 s14, v41, 1
+; GI-NEXT: v_readlane_b32 s13, v41, 2
+; GI-NEXT: v_readlane_b32 s12, v41, 3
+; GI-NEXT: v_readlane_b32 s10, v41, 4
+; GI-NEXT: v_readlane_b32 s11, v41, 5
+; GI-NEXT: v_readlane_b32 s8, v41, 6
+; GI-NEXT: v_readlane_b32 s9, v41, 7
+; GI-NEXT: v_readlane_b32 s6, v41, 8
+; GI-NEXT: v_readlane_b32 s7, v41, 9
+; GI-NEXT: v_readlane_b32 s4, v41, 10
+; GI-NEXT: v_readlane_b32 s5, v41, 11
+; GI-NEXT: v_readlane_b32 s16, v41, 15
+; GI-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GI-NEXT: s_mov_b32 s17, 6
+; GI-NEXT: s_lshl_b32 s16, s16, s17
+; GI-NEXT: s_mov_b32 s20, s32
+; GI-NEXT: s_add_u32 s16, s20, s16
+; GI-NEXT: s_mov_b32 s32, s16
+; GI-NEXT: s_mov_b64 s[16:17], src_private_base
+; GI-NEXT: ; kill: def $sgpr18 killed $sgpr16
+; GI-NEXT: s_mov_b32 s18, s17
+; GI-NEXT: s_mov_b32 s16, s20
+; GI-NEXT: s_mov_b32 s17, s18
+; GI-NEXT: s_mov_b32 s21, -1
+; GI-NEXT: s_mov_b64 s[18:19], 0
+; GI-NEXT: s_cmp_lg_u32 s20, s21
+; GI-NEXT: s_cselect_b32 s20, 1, 0
+; GI-NEXT: s_cmp_lg_u32 s20, 0
+; GI-NEXT: s_cselect_b64 s[18:19], s[16:17], s[18:19]
+; GI-NEXT: s_mov_b32 s17, s18
+; GI-NEXT: s_mov_b32 s16, s19
+; GI-NEXT: v_mov_b32_e32 v0, s17
+; GI-NEXT: v_mov_b32_e32 v1, s16
+; GI-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GI-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GI-NEXT: s_mov_b64 s[0:1], s[16:17]
+; GI-NEXT: s_mov_b64 s[2:3], s[18:19]
+; GI-NEXT: s_getpc_b64 s[16:17]
+; GI-NEXT: s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; GI-NEXT: s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; GI-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GI-NEXT: s_waitcnt lgkmcnt(0)
+; GI-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GI-NEXT: s_mov_b32 s4, 0
+; GI-NEXT: v_mov_b32_e32 v0, s4
+; GI-NEXT: v_readlane_b32 s31, v40, 1
+; GI-NEXT: v_readlane_b32 s30, v40, 0
+; GI-NEXT: s_mov_b32 s32, s33
+; GI-NEXT: v_readlane_b32 s4, v40, 4
+; GI-NEXT: v_readlane_b32 s34, v40, 2
+; GI-NEXT: v_readlane_b32 s35, v40, 3
+; GI-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GI-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GI-NEXT: s_mov_b64 exec, s[6:7]
+; GI-NEXT: s_mov_b32 s33, s4
+; GI-NEXT: s_waitcnt vmcnt(0)
+; GI-NEXT: s_setpc_b64 s[30:31]
+ %alloca = alloca i32, i32 %n, align 4
+ call void @bar(ptr %alloca)
+ %load = load i32, ptr %alloca
+ ret i32 0
+}
diff --git a/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll b/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll
deleted file mode 100644
index 1e72e679e83c0..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
-
-; The alloca has the wrong address space and is passed to a call. The
-; FrameIndex was created with the natural 32-bit pointer type instead
-; of the declared 64-bit. Make sure we don't assert.
-
-; CHECK: LLVM ERROR: Cannot select: {{.*}}: i64 = FrameIndex<0>
-
-declare void @func(ptr)
-
-define void @main() {
-bb:
- %alloca = alloca i32, align 4
- call void @func(ptr %alloca)
- ret void
-}
More information about the llvm-commits
mailing list