[llvm] [AMDGPU] Support alloca in AS0 (PR #136584)

Shilei Tian via llvm-commits llvm-commits@lists.llvm.org
Tue Apr 22 18:49:18 PDT 2025


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/136584

>From ae093995b232a38ce4487e5009fd76a3d1dd9fec Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Tue, 22 Apr 2025 20:36:20 -0400
Subject: [PATCH] [AMDGPU] Support alloca in AS0

This PR lowers an alloca in AS0 to an alloca in AS5 followed by an addrspacecast
back to AS0.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   3 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  33 +-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |   5 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  29 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   1 +
 llvm/test/CodeGen/AMDGPU/alloca-as0.ll        | 417 ++++++++++++++++++
 .../AMDGPU/assert-wrong-alloca-addrspace.ll   |  16 -
 7 files changed, 485 insertions(+), 19 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/alloca-as0.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2846405a2538c..ddc61a219eb83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -385,9 +385,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
 
+  setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
+
   // For R600, this is totally unsupported, just custom lower to produce an
   // error.
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
 
   // Library functions.  These default to Expand, but we have instructions
   // for them.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index beb6432170970..4d7002db2cca7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -912,12 +912,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .widenScalarToNextPow2(0, 32)
       .clampMaxNumElements(0, S32, 16);
 
-  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
+  getActionDefinitionsBuilder(G_FRAME_INDEX)
+      .legalFor({PrivatePtr})
+      .customFor({FlatPtr});
 
   // If the amount is divergent, we have to do a wave reduction to get the
   // maximum value, so this is expanded during RegBankSelect.
   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
-    .legalFor({{PrivatePtr, S32}});
+      .legalFor({{PrivatePtr, S32}})
+      .customFor({FlatPtr, S32});
 
   getActionDefinitionsBuilder(G_STACKSAVE)
     .customFor({PrivatePtr});
@@ -2221,6 +2224,10 @@ bool AMDGPULegalizerInfo::legalizeCustom(
     return legalizeTrap(MI, MRI, B);
   case TargetOpcode::G_DEBUGTRAP:
     return legalizeDebugTrap(MI, MRI, B);
+  case TargetOpcode::G_FRAME_INDEX:
+    return legalizeFrameIndex(MI, MRI, B);
+  case TargetOpcode::G_DYN_STACKALLOC:
+    return legalizeDynStackAlloc(MI, MRI, B);
   default:
     return false;
   }
@@ -7668,3 +7675,25 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
 
   return true;
 }
+
+bool AMDGPULegalizerInfo::legalizeFrameIndex(MachineInstr &MI,
+                                             MachineRegisterInfo &MRI,
+                                             MachineIRBuilder &B) const {
+  MachineInstrBuilder FI = B.buildFrameIndex(
+      LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), MI.getOperand(1).getIndex());
+  B.buildAddrSpaceCast(MI.getOperand(0).getReg(), FI);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeDynStackAlloc(MachineInstr &MI,
+                                                MachineRegisterInfo &MRI,
+                                                MachineIRBuilder &B) const {
+  MachineInstrBuilder Size = B.buildTrunc(S32, MI.getOperand(1));
+  Align Alignment(MI.getOperand(2).getImm());
+  MachineInstrBuilder DynStackAlloc = B.buildDynStackAlloc(
+      LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), Size, Alignment);
+  B.buildAddrSpaceCast(MI.getOperand(0).getReg(), DynStackAlloc);
+  MI.eraseFromParent();
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1f4e02b0d600a..55250530689cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -246,6 +246,11 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
 
   bool legalizeIntrinsic(LegalizerHelper &Helper,
                          MachineInstr &MI) const override;
+
+  bool legalizeFrameIndex(MachineInstr &MI, MachineRegisterInfo &MRI,
+                          MachineIRBuilder &B) const;
+  bool legalizeDynStackAlloc(MachineInstr &MI, MachineRegisterInfo &MRI,
+                             MachineIRBuilder &B) const;
 };
 } // End llvm namespace.
 #endif
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2e3cd5ca6692d..3f2e5fbce03a1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4117,6 +4117,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                          InVals, /*IsThisReturn=*/false, SDValue());
 }
 
+SDValue SITargetLowering::lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+  // Since address space information is lost here, we assume that an i64 frame
+  // index comes from an alloca in AS0.
+  SDLoc DL(Op);
+  auto *FI = cast<FrameIndexSDNode>(Op);
+  SDValue TFI = DAG.getFrameIndex(FI->getIndex(), MVT::i32);
+  return DAG.getAddrSpaceCast(DL, Op.getValueType(), TFI,
+                              AMDGPUAS::PRIVATE_ADDRESS,
+                              AMDGPUAS::FLAT_ADDRESS);
+}
+
 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
 // except for:
 // 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
@@ -4129,13 +4140,27 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+
+  // Since address space information is lost here, we assume that an i64 dynamic
+  // alloca comes from an alloca in AS0.
+  if (VT == MVT::i64) {
+    SDValue Align = Op.getOperand(2);
+    Size = DAG.getZExtOrTrunc(Size, dl, MVT::i32);
+    SDValue Ops[] = {Chain, Size, Align};
+    SDValue DynAlloc =
+        DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, {MVT::i32, MVT::Other}, Ops);
+    SDValue Cast = DAG.getAddrSpaceCast(
+        dl, VT, DynAlloc, AMDGPUAS::PRIVATE_ADDRESS, AMDGPUAS::FLAT_ADDRESS);
+    return DAG.getMergeValues({Cast, DynAlloc.getValue(1)}, dl);
+  }
+
   Register SPReg = Info->getStackPtrOffsetReg();
 
   // Chain the dynamic stack allocation so that it doesn't modify the stack
   // pointer when other instructions are using the stack.
   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
 
-  SDValue Size = Op.getOperand(1);
   SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
 
@@ -6087,6 +6112,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SMUL_LOHI:
   case ISD::UMUL_LOHI:
     return lowerXMUL_LOHI(Op, DAG);
+  case ISD::FrameIndex:
+    return lowerFrameIndex(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::STACKSAVE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..f08cd15282c94 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -428,6 +428,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
 
+  SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/alloca-as0.ll b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll
new file mode 100644
index 0000000000000..5f20f7ce1e638
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/alloca-as0.ll
@@ -0,0 +1,417 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 %s -o - | FileCheck %s --check-prefix=ISEL
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -O0 %s -o - | FileCheck %s --check-prefix=GI
+
+declare void @bar(ptr)
+
+define i32 @static_alloca() {
+; ISEL-LABEL: static_alloca:
+; ISEL:       ; %bb.0:
+; ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISEL-NEXT:    s_mov_b32 s16, s33
+; ISEL-NEXT:    s_mov_b32 s33, s32
+; ISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; ISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; ISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 exec, s[18:19]
+; ISEL-NEXT:    v_writelane_b32 v40, s16, 2
+; ISEL-NEXT:    s_add_i32 s32, s32, 0x400
+; ISEL-NEXT:    v_writelane_b32 v40, s30, 0
+; ISEL-NEXT:    v_writelane_b32 v40, s31, 1
+; ISEL-NEXT:    s_mov_b32 s18, 32
+; ISEL-NEXT:    s_mov_b64 s[16:17], src_private_base
+; ISEL-NEXT:    s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT:    s_mov_b32 s17, s16
+; ISEL-NEXT:    s_mov_b64 s[22:23], 0
+; ISEL-NEXT:    s_mov_b32 s19, s23
+; ISEL-NEXT:    s_mov_b32 s20, -1
+; ISEL-NEXT:    s_lshr_b32 s16, s33, 6
+; ISEL-NEXT:    s_cmp_lg_u32 s16, s20
+; ISEL-NEXT:    s_cselect_b32 s20, s17, s19
+; ISEL-NEXT:    s_mov_b32 s17, s22
+; ISEL-NEXT:    s_cselect_b32 s19, s16, s17
+; ISEL-NEXT:    s_mov_b32 s16, s19
+; ISEL-NEXT:    s_mov_b32 s17, s20
+; ISEL-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; ISEL-NEXT:    v_writelane_b32 v41, s16, 0
+; ISEL-NEXT:    v_writelane_b32 v41, s17, 1
+; ISEL-NEXT:    s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT:    s_mov_b32 s18, s16
+; ISEL-NEXT:    s_getpc_b64 s[16:17]
+; ISEL-NEXT:    s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; ISEL-NEXT:    s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; ISEL-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; ISEL-NEXT:    s_mov_b64 s[22:23], s[2:3]
+; ISEL-NEXT:    s_mov_b64 s[20:21], s[0:1]
+; ISEL-NEXT:    s_mov_b64 s[0:1], s[20:21]
+; ISEL-NEXT:    s_mov_b64 s[2:3], s[22:23]
+; ISEL-NEXT:    v_mov_b32_e32 v0, s19
+; ISEL-NEXT:    v_mov_b32_e32 v1, s18
+; ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; ISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; ISEL-NEXT:    v_readlane_b32 s4, v41, 0
+; ISEL-NEXT:    v_readlane_b32 s5, v41, 1
+; ISEL-NEXT:    v_mov_b32_e32 v0, s4
+; ISEL-NEXT:    v_mov_b32_e32 v1, s5
+; ISEL-NEXT:    flat_load_dword v0, v[0:1]
+; ISEL-NEXT:    v_readlane_b32 s31, v40, 1
+; ISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; ISEL-NEXT:    s_mov_b32 s32, s33
+; ISEL-NEXT:    v_readlane_b32 s4, v40, 2
+; ISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; ISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; ISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; ISEL-NEXT:    s_mov_b32 s33, s4
+; ISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GI-LABEL: static_alloca:
+; GI:       ; %bb.0:
+; GI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT:    s_mov_b32 s16, s33
+; GI-NEXT:    s_mov_b32 s33, s32
+; GI-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 exec, s[18:19]
+; GI-NEXT:    v_writelane_b32 v40, s16, 2
+; GI-NEXT:    s_add_i32 s32, s32, 0x400
+; GI-NEXT:    v_writelane_b32 v40, s30, 0
+; GI-NEXT:    v_writelane_b32 v40, s31, 1
+; GI-NEXT:    s_lshr_b32 s17, s33, 6
+; GI-NEXT:    s_mov_b64 s[18:19], src_private_base
+; GI-NEXT:    ; kill: def $sgpr16 killed $sgpr18
+; GI-NEXT:    s_mov_b32 s16, s19
+; GI-NEXT:    s_mov_b32 s18, s17
+; GI-NEXT:    s_mov_b32 s19, s16
+; GI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; GI-NEXT:    v_writelane_b32 v41, s18, 0
+; GI-NEXT:    v_writelane_b32 v41, s19, 1
+; GI-NEXT:    v_mov_b32_e32 v0, s17
+; GI-NEXT:    v_mov_b32_e32 v1, s16
+; GI-NEXT:    s_mov_b64 s[18:19], s[2:3]
+; GI-NEXT:    s_mov_b64 s[16:17], s[0:1]
+; GI-NEXT:    s_mov_b64 s[0:1], s[16:17]
+; GI-NEXT:    s_mov_b64 s[2:3], s[18:19]
+; GI-NEXT:    s_getpc_b64 s[16:17]
+; GI-NEXT:    s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; GI-NEXT:    s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; GI-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GI-NEXT:    s_waitcnt lgkmcnt(0)
+; GI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GI-NEXT:    v_readlane_b32 s4, v41, 0
+; GI-NEXT:    v_readlane_b32 s5, v41, 1
+; GI-NEXT:    v_mov_b32_e32 v0, s4
+; GI-NEXT:    v_mov_b32_e32 v1, s5
+; GI-NEXT:    flat_load_dword v0, v[0:1]
+; GI-NEXT:    v_readlane_b32 s31, v40, 1
+; GI-NEXT:    v_readlane_b32 s30, v40, 0
+; GI-NEXT:    s_mov_b32 s32, s33
+; GI-NEXT:    v_readlane_b32 s4, v40, 2
+; GI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b64 exec, s[6:7]
+; GI-NEXT:    s_mov_b32 s33, s4
+; GI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, align 4
+  call void @bar(ptr %alloca)
+  %load = load i32, ptr %alloca
+  ret i32 %load
+}
+
+define i32 @dynamic_alloca(i32 %n) {
+; ISEL-LABEL: dynamic_alloca:
+; ISEL:       ; %bb.0:
+; ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISEL-NEXT:    s_mov_b32 s16, s33
+; ISEL-NEXT:    s_mov_b32 s33, s32
+; ISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; ISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; ISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 exec, s[18:19]
+; ISEL-NEXT:    v_writelane_b32 v40, s16, 4
+; ISEL-NEXT:    v_writelane_b32 v40, s34, 2
+; ISEL-NEXT:    v_writelane_b32 v40, s35, 3
+; ISEL-NEXT:    s_add_i32 s32, s32, 0x800
+; ISEL-NEXT:    v_writelane_b32 v40, s30, 0
+; ISEL-NEXT:    v_writelane_b32 v40, s31, 1
+; ISEL-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; ISEL-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; ISEL-NEXT:    v_writelane_b32 v41, s15, 0
+; ISEL-NEXT:    v_writelane_b32 v41, s14, 1
+; ISEL-NEXT:    v_writelane_b32 v41, s13, 2
+; ISEL-NEXT:    v_writelane_b32 v41, s12, 3
+; ISEL-NEXT:    v_writelane_b32 v41, s10, 4
+; ISEL-NEXT:    v_writelane_b32 v41, s11, 5
+; ISEL-NEXT:    v_writelane_b32 v41, s8, 6
+; ISEL-NEXT:    v_writelane_b32 v41, s9, 7
+; ISEL-NEXT:    v_writelane_b32 v41, s6, 8
+; ISEL-NEXT:    v_writelane_b32 v41, s7, 9
+; ISEL-NEXT:    v_writelane_b32 v41, s4, 10
+; ISEL-NEXT:    v_writelane_b32 v41, s5, 11
+; ISEL-NEXT:    s_mov_b32 s5, 15
+; ISEL-NEXT:    s_mov_b32 s4, 2
+; ISEL-NEXT:    v_mov_b32_e32 v1, s5
+; ISEL-NEXT:    v_lshl_add_u32 v0, v0, s4, v1
+; ISEL-NEXT:    s_mov_b32 s4, -16
+; ISEL-NEXT:    v_and_b32_e64 v0, v0, s4
+; ISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; ISEL-NEXT:    v_mov_b32_e32 v0, 0
+; ISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 s[4:5], exec
+; ISEL-NEXT:    s_mov_b32 s6, 0
+; ISEL-NEXT:    v_writelane_b32 v41, s6, 12
+; ISEL-NEXT:    v_writelane_b32 v41, s4, 13
+; ISEL-NEXT:    v_writelane_b32 v41, s5, 14
+; ISEL-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; ISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; ISEL-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; ISEL-NEXT:    s_waitcnt vmcnt(0)
+; ISEL-NEXT:    v_readlane_b32 s4, v41, 13
+; ISEL-NEXT:    v_readlane_b32 s5, v41, 14
+; ISEL-NEXT:    v_readlane_b32 s6, v41, 12
+; ISEL-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; ISEL-NEXT:    s_waitcnt vmcnt(0)
+; ISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; ISEL-NEXT:    s_max_u32 s6, s6, s8
+; ISEL-NEXT:    v_writelane_b32 v41, s6, 15
+; ISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; ISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; ISEL-NEXT:    v_writelane_b32 v41, s6, 12
+; ISEL-NEXT:    v_writelane_b32 v41, s4, 13
+; ISEL-NEXT:    v_writelane_b32 v41, s5, 14
+; ISEL-NEXT:    s_mov_b64 s[34:35], exec
+; ISEL-NEXT:    s_mov_b64 exec, -1
+; ISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; ISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; ISEL-NEXT:  ; %bb.2:
+; ISEL-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; ISEL-NEXT:    s_waitcnt vmcnt(0)
+; ISEL-NEXT:    v_readlane_b32 s15, v41, 0
+; ISEL-NEXT:    v_readlane_b32 s14, v41, 1
+; ISEL-NEXT:    v_readlane_b32 s13, v41, 2
+; ISEL-NEXT:    v_readlane_b32 s12, v41, 3
+; ISEL-NEXT:    v_readlane_b32 s10, v41, 4
+; ISEL-NEXT:    v_readlane_b32 s11, v41, 5
+; ISEL-NEXT:    v_readlane_b32 s8, v41, 6
+; ISEL-NEXT:    v_readlane_b32 s9, v41, 7
+; ISEL-NEXT:    v_readlane_b32 s6, v41, 8
+; ISEL-NEXT:    v_readlane_b32 s7, v41, 9
+; ISEL-NEXT:    v_readlane_b32 s4, v41, 10
+; ISEL-NEXT:    v_readlane_b32 s5, v41, 11
+; ISEL-NEXT:    v_readlane_b32 s16, v41, 15
+; ISEL-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b32 s19, s32
+; ISEL-NEXT:    s_mov_b32 s17, 6
+; ISEL-NEXT:    v_mov_b32_e32 v0, s17
+; ISEL-NEXT:    v_mov_b32_e32 v1, s19
+; ISEL-NEXT:    v_lshl_add_u32 v0, s16, v0, v1
+; ISEL-NEXT:    v_readfirstlane_b32 s20, v0
+; ISEL-NEXT:    s_mov_b64 s[16:17], src_private_base
+; ISEL-NEXT:    s_mov_b32 s18, 32
+; ISEL-NEXT:    s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; ISEL-NEXT:    s_mov_b64 s[22:23], 0
+; ISEL-NEXT:    s_mov_b32 s17, s23
+; ISEL-NEXT:    s_mov_b32 s21, -1
+; ISEL-NEXT:    s_cmp_lg_u32 s19, s21
+; ISEL-NEXT:    s_cselect_b32 s21, s16, s17
+; ISEL-NEXT:    ; implicit-def: $sgpr16
+; ISEL-NEXT:    ; implicit-def: $sgpr17
+; ISEL-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
+; ISEL-NEXT:    s_mov_b32 s17, s21
+; ISEL-NEXT:    s_mov_b32 s21, s22
+; ISEL-NEXT:    s_cselect_b32 s19, s19, s21
+; ISEL-NEXT:    s_mov_b32 s32, s20
+; ISEL-NEXT:    s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT:    s_mov_b32 s18, s16
+; ISEL-NEXT:    s_getpc_b64 s[16:17]
+; ISEL-NEXT:    s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; ISEL-NEXT:    s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; ISEL-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; ISEL-NEXT:    s_mov_b64 s[22:23], s[2:3]
+; ISEL-NEXT:    s_mov_b64 s[20:21], s[0:1]
+; ISEL-NEXT:    s_mov_b64 s[0:1], s[20:21]
+; ISEL-NEXT:    s_mov_b64 s[2:3], s[22:23]
+; ISEL-NEXT:    v_mov_b32_e32 v0, s19
+; ISEL-NEXT:    v_mov_b32_e32 v1, s18
+; ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; ISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; ISEL-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; ISEL-NEXT:    v_readlane_b32 s31, v40, 1
+; ISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; ISEL-NEXT:    s_mov_b32 s32, s33
+; ISEL-NEXT:    v_readlane_b32 s4, v40, 4
+; ISEL-NEXT:    v_readlane_b32 s34, v40, 2
+; ISEL-NEXT:    v_readlane_b32 s35, v40, 3
+; ISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; ISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; ISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; ISEL-NEXT:    s_mov_b32 s33, s4
+; ISEL-NEXT:    s_waitcnt vmcnt(0)
+; ISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GI-LABEL: dynamic_alloca:
+; GI:       ; %bb.0:
+; GI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT:    s_mov_b32 s16, s33
+; GI-NEXT:    s_mov_b32 s33, s32
+; GI-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 exec, s[18:19]
+; GI-NEXT:    v_writelane_b32 v40, s16, 4
+; GI-NEXT:    v_writelane_b32 v40, s34, 2
+; GI-NEXT:    v_writelane_b32 v40, s35, 3
+; GI-NEXT:    s_add_i32 s32, s32, 0x800
+; GI-NEXT:    v_writelane_b32 v40, s30, 0
+; GI-NEXT:    v_writelane_b32 v40, s31, 1
+; GI-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; GI-NEXT:    v_writelane_b32 v41, s15, 0
+; GI-NEXT:    v_writelane_b32 v41, s14, 1
+; GI-NEXT:    v_writelane_b32 v41, s13, 2
+; GI-NEXT:    v_writelane_b32 v41, s12, 3
+; GI-NEXT:    v_writelane_b32 v41, s10, 4
+; GI-NEXT:    v_writelane_b32 v41, s11, 5
+; GI-NEXT:    v_writelane_b32 v41, s8, 6
+; GI-NEXT:    v_writelane_b32 v41, s9, 7
+; GI-NEXT:    v_writelane_b32 v41, s6, 8
+; GI-NEXT:    v_writelane_b32 v41, s7, 9
+; GI-NEXT:    v_writelane_b32 v41, s4, 10
+; GI-NEXT:    v_writelane_b32 v41, s5, 11
+; GI-NEXT:    v_mov_b32_e32 v1, v0
+; GI-NEXT:    v_mov_b32_e32 v0, 0
+; GI-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GI-NEXT:    v_mov_b32_e32 v2, v0
+; GI-NEXT:    s_mov_b32 s4, 2
+; GI-NEXT:    v_mov_b32_e32 v0, s4
+; GI-NEXT:    v_lshlrev_b64 v[1:2], v0, v[1:2]
+; GI-NEXT:    v_mov_b32_e32 v0, v1
+; GI-NEXT:    v_mov_b32_e32 v1, v2
+; GI-NEXT:    s_mov_b32 s4, 15
+; GI-NEXT:    s_mov_b32 s6, 0
+; GI-NEXT:    v_mov_b32_e32 v2, s4
+; GI-NEXT:    v_add_co_u32_e64 v0, s[4:5], v0, v2
+; GI-NEXT:    v_mov_b32_e32 v2, s6
+; GI-NEXT:    v_addc_co_u32_e64 v2, s[4:5], v1, v2, s[4:5]
+; GI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GI-NEXT:    v_mov_b32_e32 v1, v2
+; GI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GI-NEXT:    s_mov_b32 s4, -16
+; GI-NEXT:    v_mov_b32_e32 v1, s4
+; GI-NEXT:    v_and_b32_e64 v0, v0, v1
+; GI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 s[4:5], exec
+; GI-NEXT:    s_mov_b32 s6, 0
+; GI-NEXT:    v_writelane_b32 v41, s6, 12
+; GI-NEXT:    v_writelane_b32 v41, s4, 13
+; GI-NEXT:    v_writelane_b32 v41, s5, 14
+; GI-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 exec, s[34:35]
+; GI-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GI-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b64 exec, s[34:35]
+; GI-NEXT:    s_waitcnt vmcnt(0)
+; GI-NEXT:    v_readlane_b32 s4, v41, 13
+; GI-NEXT:    v_readlane_b32 s5, v41, 14
+; GI-NEXT:    v_readlane_b32 s6, v41, 12
+; GI-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GI-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GI-NEXT:    s_waitcnt vmcnt(0)
+; GI-NEXT:    v_readlane_b32 s8, v0, s7
+; GI-NEXT:    s_max_u32 s6, s6, s8
+; GI-NEXT:    v_writelane_b32 v41, s6, 15
+; GI-NEXT:    s_bitset0_b64 s[4:5], s7
+; GI-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GI-NEXT:    v_writelane_b32 v41, s6, 12
+; GI-NEXT:    v_writelane_b32 v41, s4, 13
+; GI-NEXT:    v_writelane_b32 v41, s5, 14
+; GI-NEXT:    s_mov_b64 s[34:35], exec
+; GI-NEXT:    s_mov_b64 exec, -1
+; GI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 exec, s[34:35]
+; GI-NEXT:    s_cbranch_scc1 .LBB1_1
+; GI-NEXT:  ; %bb.2:
+; GI-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b64 exec, s[34:35]
+; GI-NEXT:    s_waitcnt vmcnt(0)
+; GI-NEXT:    v_readlane_b32 s15, v41, 0
+; GI-NEXT:    v_readlane_b32 s14, v41, 1
+; GI-NEXT:    v_readlane_b32 s13, v41, 2
+; GI-NEXT:    v_readlane_b32 s12, v41, 3
+; GI-NEXT:    v_readlane_b32 s10, v41, 4
+; GI-NEXT:    v_readlane_b32 s11, v41, 5
+; GI-NEXT:    v_readlane_b32 s8, v41, 6
+; GI-NEXT:    v_readlane_b32 s9, v41, 7
+; GI-NEXT:    v_readlane_b32 s6, v41, 8
+; GI-NEXT:    v_readlane_b32 s7, v41, 9
+; GI-NEXT:    v_readlane_b32 s4, v41, 10
+; GI-NEXT:    v_readlane_b32 s5, v41, 11
+; GI-NEXT:    v_readlane_b32 s16, v41, 15
+; GI-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b32 s17, 6
+; GI-NEXT:    s_lshl_b32 s16, s16, s17
+; GI-NEXT:    s_mov_b32 s20, s32
+; GI-NEXT:    s_add_u32 s16, s20, s16
+; GI-NEXT:    s_mov_b32 s32, s16
+; GI-NEXT:    s_mov_b64 s[16:17], src_private_base
+; GI-NEXT:    ; kill: def $sgpr18 killed $sgpr16
+; GI-NEXT:    s_mov_b32 s18, s17
+; GI-NEXT:    s_mov_b32 s16, s20
+; GI-NEXT:    s_mov_b32 s17, s18
+; GI-NEXT:    s_mov_b32 s21, -1
+; GI-NEXT:    s_mov_b64 s[18:19], 0
+; GI-NEXT:    s_cmp_lg_u32 s20, s21
+; GI-NEXT:    s_cselect_b32 s20, 1, 0
+; GI-NEXT:    s_cmp_lg_u32 s20, 0
+; GI-NEXT:    s_cselect_b64 s[18:19], s[16:17], s[18:19]
+; GI-NEXT:    s_mov_b32 s17, s18
+; GI-NEXT:    s_mov_b32 s16, s19
+; GI-NEXT:    v_mov_b32_e32 v0, s17
+; GI-NEXT:    v_mov_b32_e32 v1, s16
+; GI-NEXT:    s_mov_b64 s[18:19], s[2:3]
+; GI-NEXT:    s_mov_b64 s[16:17], s[0:1]
+; GI-NEXT:    s_mov_b64 s[0:1], s[16:17]
+; GI-NEXT:    s_mov_b64 s[2:3], s[18:19]
+; GI-NEXT:    s_getpc_b64 s[16:17]
+; GI-NEXT:    s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; GI-NEXT:    s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; GI-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GI-NEXT:    s_waitcnt lgkmcnt(0)
+; GI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GI-NEXT:    s_mov_b32 s4, 0
+; GI-NEXT:    v_mov_b32_e32 v0, s4
+; GI-NEXT:    v_readlane_b32 s31, v40, 1
+; GI-NEXT:    v_readlane_b32 s30, v40, 0
+; GI-NEXT:    s_mov_b32 s32, s33
+; GI-NEXT:    v_readlane_b32 s4, v40, 4
+; GI-NEXT:    v_readlane_b32 s34, v40, 2
+; GI-NEXT:    v_readlane_b32 s35, v40, 3
+; GI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b64 exec, s[6:7]
+; GI-NEXT:    s_mov_b32 s33, s4
+; GI-NEXT:    s_waitcnt vmcnt(0)
+; GI-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, i32 %n, align 4
+  call void @bar(ptr %alloca)
+  %load = load i32, ptr %alloca
+  ret i32 0
+}
diff --git a/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll b/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll
deleted file mode 100644
index 1e72e679e83c0..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
-
-; The alloca has the wrong address space and is passed to a call. The
-; FrameIndex was created with the natural 32-bit pointer type instead
-; of the declared 64-bit. Make sure we don't assert.
-
-; CHECK: LLVM ERROR: Cannot select: {{.*}}: i64 = FrameIndex<0>
-
-declare void @func(ptr)
-
-define void @main() {
-bb:
-  %alloca = alloca i32, align 4
-  call void @func(ptr %alloca)
-  ret void
-}



More information about the llvm-commits mailing list