[llvm] [NFC][AMDGPU] Autogenerating test cases (PR #124507)

via llvm-commits llvm-commits@lists.llvm.org
Sun Jan 26 22:12:13 PST 2025


https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/124507

From 8590a0072af2d419e5d9c5ef13cc96cf50daca16 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande@amd.com>
Date: Mon, 27 Jan 2025 11:07:51 +0530
Subject: [PATCH 1/2] [NFC][AMDGPU] Autogenerating test cases

---
 .../test/CodeGen/AMDGPU/callee-frame-setup.ll | 2800 ++++++++++++++---
 llvm/test/CodeGen/AMDGPU/nested-calls.ll      |   96 +-
 llvm/test/CodeGen/AMDGPU/sibling-call.ll      |  800 +++--
 3 files changed, 3038 insertions(+), 658 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 6fb071dd42d2ff..3241a76d46a1e0 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -1,123 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck  -enable-var-scope -check-prefixes=GCN,MUBUF %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck  -enable-var-scope -check-prefixes=GCN,MUBUF %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck  -enable-var-scope -check-prefixes=GCN,FLATSCR %s
 
-; GCN-LABEL: {{^}}callee_no_stack:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define void @callee_no_stack() #0 {
+; GCN-LABEL: callee_no_stack:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   ret void
 }
 
-; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt
-; MUBUF-NEXT:   s_mov_b32 [[FP_COPY:s4]], s33
-; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
-; GCN-NEXT: s_setpc_b64
 define void @callee_no_stack_no_fp_elim_all() #1 {
+; MUBUF-LABEL: callee_no_stack_no_fp_elim_all:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_no_stack_no_fp_elim_all:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   ret void
 }
 
-; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_nonleaf:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define void @callee_no_stack_no_fp_elim_nonleaf() #2 {
+; GCN-LABEL: callee_no_stack_no_fp_elim_nonleaf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   ret void
 }
 
-; GCN-LABEL: {{^}}callee_with_stack:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
-; MUBUF-NEXT:   buffer_store_dword v0, off, s[0:3], s32{{$}}
-; FLATSCR-NEXT: scratch_store_dword off, v0, s32
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack() #0 {
+; MUBUF-LABEL: callee_with_stack:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_with_stack:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s32
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   ret void
 }
 
 ; Can use free call clobbered register to preserve original FP value.
-
-; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt
-; MUBUF-NEXT:   s_mov_b32 [[FP_COPY:s4]], s33
-; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; MUBUF-NEXT:   s_addk_i32 s32, 0x200
-; FLATSCR-NEXT: s_add_i32 s32, s32, 8
-; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
-; MUBUF-NEXT:   buffer_store_dword v0, off, s[0:3], s33{{$}}
-; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}}
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT:   s_mov_b32 s32, s33
-; FLATSCR-NEXT: s_mov_b32 s32, s33
-; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
-; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_no_fp_elim_all() #1 {
+; MUBUF-LABEL: callee_with_stack_no_fp_elim_all:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_addk_i32 s32, 0x200
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_with_stack_no_fp_elim_all:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 8
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   ret void
 }
 
-; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_non_leaf:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
-; MUBUF-NEXT:   buffer_store_dword v0, off, s[0:3], s32{{$}}
-; FLATSCR-NEXT: scratch_store_dword off, v0, s32{{$}}
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
+; MUBUF-LABEL: callee_with_stack_no_fp_elim_non_leaf:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_with_stack_no_fp_elim_non_leaf:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s32
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   ret void
 }
 
-; GCN-LABEL: {{^}}callee_with_stack_and_call:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], 2
-; MUBUF-DAG:   s_addk_i32 s32, 0x400{{$}}
-; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}}
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30,
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31,
-
-; MUBUF-DAG:   buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}}
-; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33{{$}}
-
-; GCN: s_swappc_b64
-
-; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]]
-; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]]
-
-; MUBUF:    s_mov_b32 s32, s33{{$}}
-; FLATSCR:  s_mov_b32 s32, s33{{$}}
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2
-; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-NEXT: s_setpc_b64 s[30:31]
 define void @callee_with_stack_and_call() #0 {
+; MUBUF-LABEL: callee_with_stack_and_call:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s16, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[18:19]
+; MUBUF-NEXT:    v_writelane_b32 v40, s16, 2
+; MUBUF-NEXT:    s_addk_i32 s32, 0x400
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_getpc_b64 s[16:17]
+; MUBUF-NEXT:    s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT:    s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
+; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_with_stack_and_call:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 2
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
+; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   call void @external_void_func_void()
@@ -130,36 +184,60 @@ define void @callee_with_stack_and_call() #0 {
 ; There is stack usage only because of the need to evict a VGPR for
 ; spilling CSR SGPRs.
 
-; GCN-LABEL: {{^}}callee_no_stack_with_call:
-; GCN: s_waitcnt
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; MUBUF-DAG:   s_addk_i32 s32, 0x400
-; FLATSCR-DAG: s_add_i32 s32, s32, 16
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]]
-
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
-; GCN: s_swappc_b64
-
-; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0
-; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1
-
-; MUBUF:   s_mov_b32 s32, s33
-; FLATSCR: s_mov_b32 s32, s33
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]]
-; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
 define void @callee_no_stack_with_call() #0 {
+; MUBUF-LABEL: callee_no_stack_with_call:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s16, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[18:19]
+; MUBUF-NEXT:    v_writelane_b32 v40, s16, 2
+; MUBUF-NEXT:    s_addk_i32 s32, 0x400
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    s_getpc_b64 s[16:17]
+; MUBUF-NEXT:    s_add_u32 s16, s16, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT:    s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    v_readlane_b32 s4, v40, 2
+; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_no_stack_with_call:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    v_writelane_b32 v40, s0, 2
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
+; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    v_readlane_b32 s0, v40, 2
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   call void @external_void_func_void()
   ret void
 }
@@ -168,26 +246,306 @@ declare hidden void @external_void_func_void() #0
 
 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and
 ; restored. No FP is required.
-;
-; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
-; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN: v_writelane_b32 [[CSR_VGPR]], s
-; GCN: v_writelane_b32 [[CSR_VGPR]], s
-
-; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]]
-; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]]
-
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
+; MUBUF-LABEL: callee_func_sgpr_spill_no_calls:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    v_writelane_b32 v40, s36, 0
+; MUBUF-NEXT:    v_writelane_b32 v40, s37, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s38, 2
+; MUBUF-NEXT:    v_writelane_b32 v40, s39, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s40, 4
+; MUBUF-NEXT:    v_writelane_b32 v40, s41, 5
+; MUBUF-NEXT:    v_writelane_b32 v40, s42, 6
+; MUBUF-NEXT:    v_writelane_b32 v40, s43, 7
+; MUBUF-NEXT:    v_writelane_b32 v40, s44, 8
+; MUBUF-NEXT:    v_writelane_b32 v40, s45, 9
+; MUBUF-NEXT:    v_writelane_b32 v40, s46, 10
+; MUBUF-NEXT:    v_writelane_b32 v40, s47, 11
+; MUBUF-NEXT:    v_writelane_b32 v40, s48, 12
+; MUBUF-NEXT:    v_writelane_b32 v40, s49, 13
+; MUBUF-NEXT:    v_writelane_b32 v40, s50, 14
+; MUBUF-NEXT:    v_writelane_b32 v40, s51, 15
+; MUBUF-NEXT:    v_writelane_b32 v40, s52, 16
+; MUBUF-NEXT:    v_writelane_b32 v40, s53, 17
+; MUBUF-NEXT:    v_writelane_b32 v40, s54, 18
+; MUBUF-NEXT:    v_writelane_b32 v40, s55, 19
+; MUBUF-NEXT:    v_writelane_b32 v40, s56, 20
+; MUBUF-NEXT:    v_writelane_b32 v40, s57, 21
+; MUBUF-NEXT:    v_writelane_b32 v40, s58, 22
+; MUBUF-NEXT:    v_writelane_b32 v40, s59, 23
+; MUBUF-NEXT:    v_writelane_b32 v40, s60, 24
+; MUBUF-NEXT:    v_writelane_b32 v40, s61, 25
+; MUBUF-NEXT:    v_writelane_b32 v40, s62, 26
+; MUBUF-NEXT:    v_writelane_b32 v40, s63, 27
+; MUBUF-NEXT:    v_writelane_b32 v40, s64, 28
+; MUBUF-NEXT:    v_writelane_b32 v40, s65, 29
+; MUBUF-NEXT:    v_writelane_b32 v40, s66, 30
+; MUBUF-NEXT:    v_writelane_b32 v40, s67, 31
+; MUBUF-NEXT:    v_writelane_b32 v40, s68, 32
+; MUBUF-NEXT:    v_writelane_b32 v40, s69, 33
+; MUBUF-NEXT:    v_writelane_b32 v40, s70, 34
+; MUBUF-NEXT:    v_writelane_b32 v40, s71, 35
+; MUBUF-NEXT:    v_writelane_b32 v40, s72, 36
+; MUBUF-NEXT:    v_writelane_b32 v40, s73, 37
+; MUBUF-NEXT:    v_writelane_b32 v40, s74, 38
+; MUBUF-NEXT:    v_writelane_b32 v40, s75, 39
+; MUBUF-NEXT:    v_writelane_b32 v40, s76, 40
+; MUBUF-NEXT:    v_writelane_b32 v40, s77, 41
+; MUBUF-NEXT:    v_writelane_b32 v40, s78, 42
+; MUBUF-NEXT:    v_writelane_b32 v40, s79, 43
+; MUBUF-NEXT:    v_writelane_b32 v40, s80, 44
+; MUBUF-NEXT:    v_writelane_b32 v40, s81, 45
+; MUBUF-NEXT:    v_writelane_b32 v40, s82, 46
+; MUBUF-NEXT:    v_writelane_b32 v40, s83, 47
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; def s[68:83]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; def s[52:67]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; def s[36:51]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; def s[4:19]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; def s[20:27]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; def s[28:29]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; use s[68:83]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; use s[52:67]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; use s[36:51]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; use s[20:27]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; use s[28:29]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; use s[4:19]
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    v_readlane_b32 s83, v40, 47
+; MUBUF-NEXT:    v_readlane_b32 s82, v40, 46
+; MUBUF-NEXT:    v_readlane_b32 s81, v40, 45
+; MUBUF-NEXT:    v_readlane_b32 s80, v40, 44
+; MUBUF-NEXT:    v_readlane_b32 s79, v40, 43
+; MUBUF-NEXT:    v_readlane_b32 s78, v40, 42
+; MUBUF-NEXT:    v_readlane_b32 s77, v40, 41
+; MUBUF-NEXT:    v_readlane_b32 s76, v40, 40
+; MUBUF-NEXT:    v_readlane_b32 s75, v40, 39
+; MUBUF-NEXT:    v_readlane_b32 s74, v40, 38
+; MUBUF-NEXT:    v_readlane_b32 s73, v40, 37
+; MUBUF-NEXT:    v_readlane_b32 s72, v40, 36
+; MUBUF-NEXT:    v_readlane_b32 s71, v40, 35
+; MUBUF-NEXT:    v_readlane_b32 s70, v40, 34
+; MUBUF-NEXT:    v_readlane_b32 s69, v40, 33
+; MUBUF-NEXT:    v_readlane_b32 s68, v40, 32
+; MUBUF-NEXT:    v_readlane_b32 s67, v40, 31
+; MUBUF-NEXT:    v_readlane_b32 s66, v40, 30
+; MUBUF-NEXT:    v_readlane_b32 s65, v40, 29
+; MUBUF-NEXT:    v_readlane_b32 s64, v40, 28
+; MUBUF-NEXT:    v_readlane_b32 s63, v40, 27
+; MUBUF-NEXT:    v_readlane_b32 s62, v40, 26
+; MUBUF-NEXT:    v_readlane_b32 s61, v40, 25
+; MUBUF-NEXT:    v_readlane_b32 s60, v40, 24
+; MUBUF-NEXT:    v_readlane_b32 s59, v40, 23
+; MUBUF-NEXT:    v_readlane_b32 s58, v40, 22
+; MUBUF-NEXT:    v_readlane_b32 s57, v40, 21
+; MUBUF-NEXT:    v_readlane_b32 s56, v40, 20
+; MUBUF-NEXT:    v_readlane_b32 s55, v40, 19
+; MUBUF-NEXT:    v_readlane_b32 s54, v40, 18
+; MUBUF-NEXT:    v_readlane_b32 s53, v40, 17
+; MUBUF-NEXT:    v_readlane_b32 s52, v40, 16
+; MUBUF-NEXT:    v_readlane_b32 s51, v40, 15
+; MUBUF-NEXT:    v_readlane_b32 s50, v40, 14
+; MUBUF-NEXT:    v_readlane_b32 s49, v40, 13
+; MUBUF-NEXT:    v_readlane_b32 s48, v40, 12
+; MUBUF-NEXT:    v_readlane_b32 s47, v40, 11
+; MUBUF-NEXT:    v_readlane_b32 s46, v40, 10
+; MUBUF-NEXT:    v_readlane_b32 s45, v40, 9
+; MUBUF-NEXT:    v_readlane_b32 s44, v40, 8
+; MUBUF-NEXT:    v_readlane_b32 s43, v40, 7
+; MUBUF-NEXT:    v_readlane_b32 s42, v40, 6
+; MUBUF-NEXT:    v_readlane_b32 s41, v40, 5
+; MUBUF-NEXT:    v_readlane_b32 s40, v40, 4
+; MUBUF-NEXT:    v_readlane_b32 s39, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s38, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s37, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s36, v40, 0
+; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_func_sgpr_spill_no_calls:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s36, 4
+; FLATSCR-NEXT:    v_writelane_b32 v40, s37, 5
+; FLATSCR-NEXT:    v_writelane_b32 v40, s38, 6
+; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 7
+; FLATSCR-NEXT:    v_writelane_b32 v40, s40, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s41, 9
+; FLATSCR-NEXT:    v_writelane_b32 v40, s42, 10
+; FLATSCR-NEXT:    v_writelane_b32 v40, s43, 11
+; FLATSCR-NEXT:    v_writelane_b32 v40, s44, 12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s45, 13
+; FLATSCR-NEXT:    v_writelane_b32 v40, s46, 14
+; FLATSCR-NEXT:    v_writelane_b32 v40, s47, 15
+; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 17
+; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 18
+; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 19
+; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 20
+; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 21
+; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 22
+; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 23
+; FLATSCR-NEXT:    v_writelane_b32 v40, s56, 24
+; FLATSCR-NEXT:    v_writelane_b32 v40, s57, 25
+; FLATSCR-NEXT:    v_writelane_b32 v40, s58, 26
+; FLATSCR-NEXT:    v_writelane_b32 v40, s59, 27
+; FLATSCR-NEXT:    v_writelane_b32 v40, s60, 28
+; FLATSCR-NEXT:    v_writelane_b32 v40, s61, 29
+; FLATSCR-NEXT:    v_writelane_b32 v40, s62, 30
+; FLATSCR-NEXT:    v_writelane_b32 v40, s63, 31
+; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 32
+; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 33
+; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 34
+; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 35
+; FLATSCR-NEXT:    v_writelane_b32 v40, s68, 36
+; FLATSCR-NEXT:    v_writelane_b32 v40, s69, 37
+; FLATSCR-NEXT:    v_writelane_b32 v40, s70, 38
+; FLATSCR-NEXT:    v_writelane_b32 v40, s71, 39
+; FLATSCR-NEXT:    v_writelane_b32 v40, s72, 40
+; FLATSCR-NEXT:    v_writelane_b32 v40, s73, 41
+; FLATSCR-NEXT:    v_writelane_b32 v40, s74, 42
+; FLATSCR-NEXT:    v_writelane_b32 v40, s75, 43
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; def s[52:67]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; def s[36:51]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; def s[16:31]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; def s[0:15]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; def s[68:75]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; def s[34:35]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; use s[52:67]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; use s[36:51]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; use s[16:31]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; use s[68:75]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; use s[34:35]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; use s[0:15]
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s75, v40, 43
+; FLATSCR-NEXT:    v_readlane_b32 s74, v40, 42
+; FLATSCR-NEXT:    v_readlane_b32 s73, v40, 41
+; FLATSCR-NEXT:    v_readlane_b32 s72, v40, 40
+; FLATSCR-NEXT:    v_readlane_b32 s71, v40, 39
+; FLATSCR-NEXT:    v_readlane_b32 s70, v40, 38
+; FLATSCR-NEXT:    v_readlane_b32 s69, v40, 37
+; FLATSCR-NEXT:    v_readlane_b32 s68, v40, 36
+; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 35
+; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 34
+; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 33
+; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 32
+; FLATSCR-NEXT:    v_readlane_b32 s63, v40, 31
+; FLATSCR-NEXT:    v_readlane_b32 s62, v40, 30
+; FLATSCR-NEXT:    v_readlane_b32 s61, v40, 29
+; FLATSCR-NEXT:    v_readlane_b32 s60, v40, 28
+; FLATSCR-NEXT:    v_readlane_b32 s59, v40, 27
+; FLATSCR-NEXT:    v_readlane_b32 s58, v40, 26
+; FLATSCR-NEXT:    v_readlane_b32 s57, v40, 25
+; FLATSCR-NEXT:    v_readlane_b32 s56, v40, 24
+; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 23
+; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 22
+; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 21
+; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 20
+; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 19
+; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 18
+; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 17
+; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 16
+; FLATSCR-NEXT:    v_readlane_b32 s47, v40, 15
+; FLATSCR-NEXT:    v_readlane_b32 s46, v40, 14
+; FLATSCR-NEXT:    v_readlane_b32 s45, v40, 13
+; FLATSCR-NEXT:    v_readlane_b32 s44, v40, 12
+; FLATSCR-NEXT:    v_readlane_b32 s43, v40, 11
+; FLATSCR-NEXT:    v_readlane_b32 s42, v40, 10
+; FLATSCR-NEXT:    v_readlane_b32 s41, v40, 9
+; FLATSCR-NEXT:    v_readlane_b32 s40, v40, 8
+; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 7
+; FLATSCR-NEXT:    v_readlane_b32 s38, v40, 6
+; FLATSCR-NEXT:    v_readlane_b32 s37, v40, 5
+; FLATSCR-NEXT:    v_readlane_b32 s36, v40, 4
+; FLATSCR-NEXT:    v_readlane_b32 s35, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s34, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
   call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
   call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
@@ -212,55 +570,83 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
 
 ; Has no spilled CSR VGPRs used for SGPR spilling, so no need to
 ; enable all lanes and restore.
-
-; GCN-LABEL: {{^}}spill_only_csr_sgpr:
-; GCN: s_waitcnt
-; GCN-NEXT: s_xor_saveexec_b64
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec,
-; GCN-NEXT: v_writelane_b32 v0, s42, 0
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; clobber s42
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: v_readlane_b32 s42, v0, 0
-; GCN-NEXT: s_xor_saveexec_b64
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec,
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define void @spill_only_csr_sgpr() {
+; MUBUF-LABEL: spill_only_csr_sgpr:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    v_writelane_b32 v0, s42, 0
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber s42
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    v_readlane_b32 s42, v0, 0
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: spill_only_csr_sgpr:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    v_writelane_b32 v0, s42, 0
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber s42
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s42, v0, 0
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber s42", "~{s42}"()
   ret void
 }
 
 ; TODO: Can the SP inc/dec be removed?
-; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr:
-; GCN: s_waitcnt
-; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; MUBUF-DAG:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
-; MUBUF-DAG:   buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
-; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:4
-
-; GCN:	;;#ASMSTART
-; GCN-NEXT: ; clobber v41
-; GCN-NEXT: ;;#ASMEND
-
-; MUBUF:   buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; MUBUF:        s_addk_i32 s32, 0x300
-; MUBUF-NEXT:   s_mov_b32 s32, s33
-; MUBUF-NEXT:   s_mov_b32 s33, s4
-; FLATSCR:      s_add_i32 s32, s32, 12
-; FLATSCR-NEXT:   s_mov_b32 s32, s33
-; FLATSCR-NEXT: s_mov_b32 s33, s0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
+; MUBUF-LABEL: callee_with_stack_no_fp_elim_csr_vgpr:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber v41
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_with_stack_no_fp_elim_csr_vgpr:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s33 offset:4
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber v41
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   call void asm sideeffect "; clobber v41", "~{v41}"()
@@ -268,32 +654,312 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 }
 
 ; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
-; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr:
-; GCN: s_waitcnt
-; GCN-NEXT: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33
-; GCN: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
-; GCN: v_writelane_b32 v1
-; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4
-; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4
-; GCN: ;;#ASMSTART
-
-; MUBUF:        s_mov_b32 s32, s33
-; FLATSCR:      s_mov_b32 s32, s33
-
-; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define void @last_lane_vgpr_for_fp_csr() #1 {
+; MUBUF-LABEL: last_lane_vgpr_for_fp_csr:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    v_writelane_b32 v1, s40, 0
+; MUBUF-NEXT:    v_writelane_b32 v1, s41, 1
+; MUBUF-NEXT:    v_writelane_b32 v1, s42, 2
+; MUBUF-NEXT:    v_writelane_b32 v1, s43, 3
+; MUBUF-NEXT:    v_writelane_b32 v1, s44, 4
+; MUBUF-NEXT:    v_writelane_b32 v1, s45, 5
+; MUBUF-NEXT:    v_writelane_b32 v1, s46, 6
+; MUBUF-NEXT:    v_writelane_b32 v1, s47, 7
+; MUBUF-NEXT:    v_writelane_b32 v1, s48, 8
+; MUBUF-NEXT:    v_writelane_b32 v1, s49, 9
+; MUBUF-NEXT:    v_writelane_b32 v1, s50, 10
+; MUBUF-NEXT:    v_writelane_b32 v1, s51, 11
+; MUBUF-NEXT:    v_writelane_b32 v1, s52, 12
+; MUBUF-NEXT:    v_writelane_b32 v1, s53, 13
+; MUBUF-NEXT:    v_writelane_b32 v1, s54, 14
+; MUBUF-NEXT:    v_writelane_b32 v1, s55, 15
+; MUBUF-NEXT:    v_writelane_b32 v1, s56, 16
+; MUBUF-NEXT:    v_writelane_b32 v1, s57, 17
+; MUBUF-NEXT:    v_writelane_b32 v1, s58, 18
+; MUBUF-NEXT:    v_writelane_b32 v1, s59, 19
+; MUBUF-NEXT:    v_writelane_b32 v1, s60, 20
+; MUBUF-NEXT:    v_writelane_b32 v1, s61, 21
+; MUBUF-NEXT:    v_writelane_b32 v1, s62, 22
+; MUBUF-NEXT:    v_writelane_b32 v1, s63, 23
+; MUBUF-NEXT:    v_writelane_b32 v1, s64, 24
+; MUBUF-NEXT:    v_writelane_b32 v1, s65, 25
+; MUBUF-NEXT:    v_writelane_b32 v1, s66, 26
+; MUBUF-NEXT:    v_writelane_b32 v1, s67, 27
+; MUBUF-NEXT:    v_writelane_b32 v1, s68, 28
+; MUBUF-NEXT:    v_writelane_b32 v1, s69, 29
+; MUBUF-NEXT:    v_writelane_b32 v1, s70, 30
+; MUBUF-NEXT:    v_writelane_b32 v1, s71, 31
+; MUBUF-NEXT:    v_writelane_b32 v1, s72, 32
+; MUBUF-NEXT:    v_writelane_b32 v1, s73, 33
+; MUBUF-NEXT:    v_writelane_b32 v1, s74, 34
+; MUBUF-NEXT:    v_writelane_b32 v1, s75, 35
+; MUBUF-NEXT:    v_writelane_b32 v1, s76, 36
+; MUBUF-NEXT:    v_writelane_b32 v1, s77, 37
+; MUBUF-NEXT:    v_writelane_b32 v1, s78, 38
+; MUBUF-NEXT:    v_writelane_b32 v1, s79, 39
+; MUBUF-NEXT:    v_writelane_b32 v1, s80, 40
+; MUBUF-NEXT:    v_writelane_b32 v1, s81, 41
+; MUBUF-NEXT:    v_writelane_b32 v1, s82, 42
+; MUBUF-NEXT:    v_writelane_b32 v1, s83, 43
+; MUBUF-NEXT:    v_writelane_b32 v1, s84, 44
+; MUBUF-NEXT:    v_writelane_b32 v1, s85, 45
+; MUBUF-NEXT:    v_writelane_b32 v1, s86, 46
+; MUBUF-NEXT:    v_writelane_b32 v1, s87, 47
+; MUBUF-NEXT:    v_writelane_b32 v1, s88, 48
+; MUBUF-NEXT:    v_writelane_b32 v1, s89, 49
+; MUBUF-NEXT:    v_writelane_b32 v1, s90, 50
+; MUBUF-NEXT:    v_writelane_b32 v1, s91, 51
+; MUBUF-NEXT:    v_writelane_b32 v1, s92, 52
+; MUBUF-NEXT:    v_writelane_b32 v1, s93, 53
+; MUBUF-NEXT:    v_writelane_b32 v1, s94, 54
+; MUBUF-NEXT:    v_writelane_b32 v1, s95, 55
+; MUBUF-NEXT:    v_writelane_b32 v1, s96, 56
+; MUBUF-NEXT:    v_writelane_b32 v1, s97, 57
+; MUBUF-NEXT:    v_writelane_b32 v1, s98, 58
+; MUBUF-NEXT:    v_writelane_b32 v1, s99, 59
+; MUBUF-NEXT:    v_writelane_b32 v1, s100, 60
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    v_writelane_b32 v1, s101, 61
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber v41
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    v_writelane_b32 v1, s102, 62
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_addk_i32 s32, 0x400
+; MUBUF-NEXT:    v_readlane_b32 s102, v1, 62
+; MUBUF-NEXT:    v_readlane_b32 s101, v1, 61
+; MUBUF-NEXT:    v_readlane_b32 s100, v1, 60
+; MUBUF-NEXT:    v_readlane_b32 s99, v1, 59
+; MUBUF-NEXT:    v_readlane_b32 s98, v1, 58
+; MUBUF-NEXT:    v_readlane_b32 s97, v1, 57
+; MUBUF-NEXT:    v_readlane_b32 s96, v1, 56
+; MUBUF-NEXT:    v_readlane_b32 s95, v1, 55
+; MUBUF-NEXT:    v_readlane_b32 s94, v1, 54
+; MUBUF-NEXT:    v_readlane_b32 s93, v1, 53
+; MUBUF-NEXT:    v_readlane_b32 s92, v1, 52
+; MUBUF-NEXT:    v_readlane_b32 s91, v1, 51
+; MUBUF-NEXT:    v_readlane_b32 s90, v1, 50
+; MUBUF-NEXT:    v_readlane_b32 s89, v1, 49
+; MUBUF-NEXT:    v_readlane_b32 s88, v1, 48
+; MUBUF-NEXT:    v_readlane_b32 s87, v1, 47
+; MUBUF-NEXT:    v_readlane_b32 s86, v1, 46
+; MUBUF-NEXT:    v_readlane_b32 s85, v1, 45
+; MUBUF-NEXT:    v_readlane_b32 s84, v1, 44
+; MUBUF-NEXT:    v_readlane_b32 s83, v1, 43
+; MUBUF-NEXT:    v_readlane_b32 s82, v1, 42
+; MUBUF-NEXT:    v_readlane_b32 s81, v1, 41
+; MUBUF-NEXT:    v_readlane_b32 s80, v1, 40
+; MUBUF-NEXT:    v_readlane_b32 s79, v1, 39
+; MUBUF-NEXT:    v_readlane_b32 s78, v1, 38
+; MUBUF-NEXT:    v_readlane_b32 s77, v1, 37
+; MUBUF-NEXT:    v_readlane_b32 s76, v1, 36
+; MUBUF-NEXT:    v_readlane_b32 s75, v1, 35
+; MUBUF-NEXT:    v_readlane_b32 s74, v1, 34
+; MUBUF-NEXT:    v_readlane_b32 s73, v1, 33
+; MUBUF-NEXT:    v_readlane_b32 s72, v1, 32
+; MUBUF-NEXT:    v_readlane_b32 s71, v1, 31
+; MUBUF-NEXT:    v_readlane_b32 s70, v1, 30
+; MUBUF-NEXT:    v_readlane_b32 s69, v1, 29
+; MUBUF-NEXT:    v_readlane_b32 s68, v1, 28
+; MUBUF-NEXT:    v_readlane_b32 s67, v1, 27
+; MUBUF-NEXT:    v_readlane_b32 s66, v1, 26
+; MUBUF-NEXT:    v_readlane_b32 s65, v1, 25
+; MUBUF-NEXT:    v_readlane_b32 s64, v1, 24
+; MUBUF-NEXT:    v_readlane_b32 s63, v1, 23
+; MUBUF-NEXT:    v_readlane_b32 s62, v1, 22
+; MUBUF-NEXT:    v_readlane_b32 s61, v1, 21
+; MUBUF-NEXT:    v_readlane_b32 s60, v1, 20
+; MUBUF-NEXT:    v_readlane_b32 s59, v1, 19
+; MUBUF-NEXT:    v_readlane_b32 s58, v1, 18
+; MUBUF-NEXT:    v_readlane_b32 s57, v1, 17
+; MUBUF-NEXT:    v_readlane_b32 s56, v1, 16
+; MUBUF-NEXT:    v_readlane_b32 s55, v1, 15
+; MUBUF-NEXT:    v_readlane_b32 s54, v1, 14
+; MUBUF-NEXT:    v_readlane_b32 s53, v1, 13
+; MUBUF-NEXT:    v_readlane_b32 s52, v1, 12
+; MUBUF-NEXT:    v_readlane_b32 s51, v1, 11
+; MUBUF-NEXT:    v_readlane_b32 s50, v1, 10
+; MUBUF-NEXT:    v_readlane_b32 s49, v1, 9
+; MUBUF-NEXT:    v_readlane_b32 s48, v1, 8
+; MUBUF-NEXT:    v_readlane_b32 s47, v1, 7
+; MUBUF-NEXT:    v_readlane_b32 s46, v1, 6
+; MUBUF-NEXT:    v_readlane_b32 s45, v1, 5
+; MUBUF-NEXT:    v_readlane_b32 s44, v1, 4
+; MUBUF-NEXT:    v_readlane_b32 s43, v1, 3
+; MUBUF-NEXT:    v_readlane_b32 s42, v1, 2
+; MUBUF-NEXT:    v_readlane_b32 s41, v1, 1
+; MUBUF-NEXT:    v_readlane_b32 s40, v1, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: last_lane_vgpr_for_fp_csr:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    v_writelane_b32 v1, s40, 0
+; FLATSCR-NEXT:    v_writelane_b32 v1, s41, 1
+; FLATSCR-NEXT:    v_writelane_b32 v1, s42, 2
+; FLATSCR-NEXT:    v_writelane_b32 v1, s43, 3
+; FLATSCR-NEXT:    v_writelane_b32 v1, s44, 4
+; FLATSCR-NEXT:    v_writelane_b32 v1, s45, 5
+; FLATSCR-NEXT:    v_writelane_b32 v1, s46, 6
+; FLATSCR-NEXT:    v_writelane_b32 v1, s47, 7
+; FLATSCR-NEXT:    v_writelane_b32 v1, s48, 8
+; FLATSCR-NEXT:    v_writelane_b32 v1, s49, 9
+; FLATSCR-NEXT:    v_writelane_b32 v1, s50, 10
+; FLATSCR-NEXT:    v_writelane_b32 v1, s51, 11
+; FLATSCR-NEXT:    v_writelane_b32 v1, s52, 12
+; FLATSCR-NEXT:    v_writelane_b32 v1, s53, 13
+; FLATSCR-NEXT:    v_writelane_b32 v1, s54, 14
+; FLATSCR-NEXT:    v_writelane_b32 v1, s55, 15
+; FLATSCR-NEXT:    v_writelane_b32 v1, s56, 16
+; FLATSCR-NEXT:    v_writelane_b32 v1, s57, 17
+; FLATSCR-NEXT:    v_writelane_b32 v1, s58, 18
+; FLATSCR-NEXT:    v_writelane_b32 v1, s59, 19
+; FLATSCR-NEXT:    v_writelane_b32 v1, s60, 20
+; FLATSCR-NEXT:    v_writelane_b32 v1, s61, 21
+; FLATSCR-NEXT:    v_writelane_b32 v1, s62, 22
+; FLATSCR-NEXT:    v_writelane_b32 v1, s63, 23
+; FLATSCR-NEXT:    v_writelane_b32 v1, s64, 24
+; FLATSCR-NEXT:    v_writelane_b32 v1, s65, 25
+; FLATSCR-NEXT:    v_writelane_b32 v1, s66, 26
+; FLATSCR-NEXT:    v_writelane_b32 v1, s67, 27
+; FLATSCR-NEXT:    v_writelane_b32 v1, s68, 28
+; FLATSCR-NEXT:    v_writelane_b32 v1, s69, 29
+; FLATSCR-NEXT:    v_writelane_b32 v1, s70, 30
+; FLATSCR-NEXT:    v_writelane_b32 v1, s71, 31
+; FLATSCR-NEXT:    v_writelane_b32 v1, s72, 32
+; FLATSCR-NEXT:    v_writelane_b32 v1, s73, 33
+; FLATSCR-NEXT:    v_writelane_b32 v1, s74, 34
+; FLATSCR-NEXT:    v_writelane_b32 v1, s75, 35
+; FLATSCR-NEXT:    v_writelane_b32 v1, s76, 36
+; FLATSCR-NEXT:    v_writelane_b32 v1, s77, 37
+; FLATSCR-NEXT:    v_writelane_b32 v1, s78, 38
+; FLATSCR-NEXT:    v_writelane_b32 v1, s79, 39
+; FLATSCR-NEXT:    v_writelane_b32 v1, s80, 40
+; FLATSCR-NEXT:    v_writelane_b32 v1, s81, 41
+; FLATSCR-NEXT:    v_writelane_b32 v1, s82, 42
+; FLATSCR-NEXT:    v_writelane_b32 v1, s83, 43
+; FLATSCR-NEXT:    v_writelane_b32 v1, s84, 44
+; FLATSCR-NEXT:    v_writelane_b32 v1, s85, 45
+; FLATSCR-NEXT:    v_writelane_b32 v1, s86, 46
+; FLATSCR-NEXT:    v_writelane_b32 v1, s87, 47
+; FLATSCR-NEXT:    v_writelane_b32 v1, s88, 48
+; FLATSCR-NEXT:    v_writelane_b32 v1, s89, 49
+; FLATSCR-NEXT:    v_writelane_b32 v1, s90, 50
+; FLATSCR-NEXT:    v_writelane_b32 v1, s91, 51
+; FLATSCR-NEXT:    v_writelane_b32 v1, s92, 52
+; FLATSCR-NEXT:    v_writelane_b32 v1, s93, 53
+; FLATSCR-NEXT:    v_writelane_b32 v1, s94, 54
+; FLATSCR-NEXT:    v_writelane_b32 v1, s95, 55
+; FLATSCR-NEXT:    v_writelane_b32 v1, s96, 56
+; FLATSCR-NEXT:    v_writelane_b32 v1, s97, 57
+; FLATSCR-NEXT:    v_writelane_b32 v1, s98, 58
+; FLATSCR-NEXT:    v_writelane_b32 v1, s99, 59
+; FLATSCR-NEXT:    v_writelane_b32 v1, s100, 60
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    v_writelane_b32 v1, s101, 61
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s33 offset:4
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber v41
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_writelane_b32 v1, s102, 62
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
+; FLATSCR-NEXT:    v_readlane_b32 s102, v1, 62
+; FLATSCR-NEXT:    v_readlane_b32 s101, v1, 61
+; FLATSCR-NEXT:    v_readlane_b32 s100, v1, 60
+; FLATSCR-NEXT:    v_readlane_b32 s99, v1, 59
+; FLATSCR-NEXT:    v_readlane_b32 s98, v1, 58
+; FLATSCR-NEXT:    v_readlane_b32 s97, v1, 57
+; FLATSCR-NEXT:    v_readlane_b32 s96, v1, 56
+; FLATSCR-NEXT:    v_readlane_b32 s95, v1, 55
+; FLATSCR-NEXT:    v_readlane_b32 s94, v1, 54
+; FLATSCR-NEXT:    v_readlane_b32 s93, v1, 53
+; FLATSCR-NEXT:    v_readlane_b32 s92, v1, 52
+; FLATSCR-NEXT:    v_readlane_b32 s91, v1, 51
+; FLATSCR-NEXT:    v_readlane_b32 s90, v1, 50
+; FLATSCR-NEXT:    v_readlane_b32 s89, v1, 49
+; FLATSCR-NEXT:    v_readlane_b32 s88, v1, 48
+; FLATSCR-NEXT:    v_readlane_b32 s87, v1, 47
+; FLATSCR-NEXT:    v_readlane_b32 s86, v1, 46
+; FLATSCR-NEXT:    v_readlane_b32 s85, v1, 45
+; FLATSCR-NEXT:    v_readlane_b32 s84, v1, 44
+; FLATSCR-NEXT:    v_readlane_b32 s83, v1, 43
+; FLATSCR-NEXT:    v_readlane_b32 s82, v1, 42
+; FLATSCR-NEXT:    v_readlane_b32 s81, v1, 41
+; FLATSCR-NEXT:    v_readlane_b32 s80, v1, 40
+; FLATSCR-NEXT:    v_readlane_b32 s79, v1, 39
+; FLATSCR-NEXT:    v_readlane_b32 s78, v1, 38
+; FLATSCR-NEXT:    v_readlane_b32 s77, v1, 37
+; FLATSCR-NEXT:    v_readlane_b32 s76, v1, 36
+; FLATSCR-NEXT:    v_readlane_b32 s75, v1, 35
+; FLATSCR-NEXT:    v_readlane_b32 s74, v1, 34
+; FLATSCR-NEXT:    v_readlane_b32 s73, v1, 33
+; FLATSCR-NEXT:    v_readlane_b32 s72, v1, 32
+; FLATSCR-NEXT:    v_readlane_b32 s71, v1, 31
+; FLATSCR-NEXT:    v_readlane_b32 s70, v1, 30
+; FLATSCR-NEXT:    v_readlane_b32 s69, v1, 29
+; FLATSCR-NEXT:    v_readlane_b32 s68, v1, 28
+; FLATSCR-NEXT:    v_readlane_b32 s67, v1, 27
+; FLATSCR-NEXT:    v_readlane_b32 s66, v1, 26
+; FLATSCR-NEXT:    v_readlane_b32 s65, v1, 25
+; FLATSCR-NEXT:    v_readlane_b32 s64, v1, 24
+; FLATSCR-NEXT:    v_readlane_b32 s63, v1, 23
+; FLATSCR-NEXT:    v_readlane_b32 s62, v1, 22
+; FLATSCR-NEXT:    v_readlane_b32 s61, v1, 21
+; FLATSCR-NEXT:    v_readlane_b32 s60, v1, 20
+; FLATSCR-NEXT:    v_readlane_b32 s59, v1, 19
+; FLATSCR-NEXT:    v_readlane_b32 s58, v1, 18
+; FLATSCR-NEXT:    v_readlane_b32 s57, v1, 17
+; FLATSCR-NEXT:    v_readlane_b32 s56, v1, 16
+; FLATSCR-NEXT:    v_readlane_b32 s55, v1, 15
+; FLATSCR-NEXT:    v_readlane_b32 s54, v1, 14
+; FLATSCR-NEXT:    v_readlane_b32 s53, v1, 13
+; FLATSCR-NEXT:    v_readlane_b32 s52, v1, 12
+; FLATSCR-NEXT:    v_readlane_b32 s51, v1, 11
+; FLATSCR-NEXT:    v_readlane_b32 s50, v1, 10
+; FLATSCR-NEXT:    v_readlane_b32 s49, v1, 9
+; FLATSCR-NEXT:    v_readlane_b32 s48, v1, 8
+; FLATSCR-NEXT:    v_readlane_b32 s47, v1, 7
+; FLATSCR-NEXT:    v_readlane_b32 s46, v1, 6
+; FLATSCR-NEXT:    v_readlane_b32 s45, v1, 5
+; FLATSCR-NEXT:    v_readlane_b32 s44, v1, 4
+; FLATSCR-NEXT:    v_readlane_b32 s43, v1, 3
+; FLATSCR-NEXT:    v_readlane_b32 s42, v1, 2
+; FLATSCR-NEXT:    v_readlane_b32 s41, v1, 1
+; FLATSCR-NEXT:    v_readlane_b32 s40, v1, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   call void asm sideeffect "; clobber v41", "~{v41}"()
@@ -310,37 +976,316 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 }
 
 ; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
-; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr:
-; GCN: s_waitcnt
-; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-COUNT-61: v_writelane_b32 v1,
-; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
-; GCN: v_writelane_b32 v1,
-; MUBUF:   buffer_store_dword
-; FLATSCR: scratch_store_dword
-; GCN: ;;#ASMSTART
-; GCN: v_writelane_b32 v1,
-; MUBUF:   buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
-; MUBUF:        s_addk_i32 s32, 0x400
-; FLATSCR:      s_add_i32 s32, s32, 16
-; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
-; MUBUF-NEXT:   s_mov_b32 s32, s33
-; FLATSCR-NEXT: s_mov_b32 s32, s33
-; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define void @no_new_vgpr_for_fp_csr() #1 {
+; MUBUF-LABEL: no_new_vgpr_for_fp_csr:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    v_writelane_b32 v1, s39, 0
+; MUBUF-NEXT:    v_writelane_b32 v1, s40, 1
+; MUBUF-NEXT:    v_writelane_b32 v1, s41, 2
+; MUBUF-NEXT:    v_writelane_b32 v1, s42, 3
+; MUBUF-NEXT:    v_writelane_b32 v1, s43, 4
+; MUBUF-NEXT:    v_writelane_b32 v1, s44, 5
+; MUBUF-NEXT:    v_writelane_b32 v1, s45, 6
+; MUBUF-NEXT:    v_writelane_b32 v1, s46, 7
+; MUBUF-NEXT:    v_writelane_b32 v1, s47, 8
+; MUBUF-NEXT:    v_writelane_b32 v1, s48, 9
+; MUBUF-NEXT:    v_writelane_b32 v1, s49, 10
+; MUBUF-NEXT:    v_writelane_b32 v1, s50, 11
+; MUBUF-NEXT:    v_writelane_b32 v1, s51, 12
+; MUBUF-NEXT:    v_writelane_b32 v1, s52, 13
+; MUBUF-NEXT:    v_writelane_b32 v1, s53, 14
+; MUBUF-NEXT:    v_writelane_b32 v1, s54, 15
+; MUBUF-NEXT:    v_writelane_b32 v1, s55, 16
+; MUBUF-NEXT:    v_writelane_b32 v1, s56, 17
+; MUBUF-NEXT:    v_writelane_b32 v1, s57, 18
+; MUBUF-NEXT:    v_writelane_b32 v1, s58, 19
+; MUBUF-NEXT:    v_writelane_b32 v1, s59, 20
+; MUBUF-NEXT:    v_writelane_b32 v1, s60, 21
+; MUBUF-NEXT:    v_writelane_b32 v1, s61, 22
+; MUBUF-NEXT:    v_writelane_b32 v1, s62, 23
+; MUBUF-NEXT:    v_writelane_b32 v1, s63, 24
+; MUBUF-NEXT:    v_writelane_b32 v1, s64, 25
+; MUBUF-NEXT:    v_writelane_b32 v1, s65, 26
+; MUBUF-NEXT:    v_writelane_b32 v1, s66, 27
+; MUBUF-NEXT:    v_writelane_b32 v1, s67, 28
+; MUBUF-NEXT:    v_writelane_b32 v1, s68, 29
+; MUBUF-NEXT:    v_writelane_b32 v1, s69, 30
+; MUBUF-NEXT:    v_writelane_b32 v1, s70, 31
+; MUBUF-NEXT:    v_writelane_b32 v1, s71, 32
+; MUBUF-NEXT:    v_writelane_b32 v1, s72, 33
+; MUBUF-NEXT:    v_writelane_b32 v1, s73, 34
+; MUBUF-NEXT:    v_writelane_b32 v1, s74, 35
+; MUBUF-NEXT:    v_writelane_b32 v1, s75, 36
+; MUBUF-NEXT:    v_writelane_b32 v1, s76, 37
+; MUBUF-NEXT:    v_writelane_b32 v1, s77, 38
+; MUBUF-NEXT:    v_writelane_b32 v1, s78, 39
+; MUBUF-NEXT:    v_writelane_b32 v1, s79, 40
+; MUBUF-NEXT:    v_writelane_b32 v1, s80, 41
+; MUBUF-NEXT:    v_writelane_b32 v1, s81, 42
+; MUBUF-NEXT:    v_writelane_b32 v1, s82, 43
+; MUBUF-NEXT:    v_writelane_b32 v1, s83, 44
+; MUBUF-NEXT:    v_writelane_b32 v1, s84, 45
+; MUBUF-NEXT:    v_writelane_b32 v1, s85, 46
+; MUBUF-NEXT:    v_writelane_b32 v1, s86, 47
+; MUBUF-NEXT:    v_writelane_b32 v1, s87, 48
+; MUBUF-NEXT:    v_writelane_b32 v1, s88, 49
+; MUBUF-NEXT:    v_writelane_b32 v1, s89, 50
+; MUBUF-NEXT:    v_writelane_b32 v1, s90, 51
+; MUBUF-NEXT:    v_writelane_b32 v1, s91, 52
+; MUBUF-NEXT:    v_writelane_b32 v1, s92, 53
+; MUBUF-NEXT:    v_writelane_b32 v1, s93, 54
+; MUBUF-NEXT:    v_writelane_b32 v1, s94, 55
+; MUBUF-NEXT:    v_writelane_b32 v1, s95, 56
+; MUBUF-NEXT:    v_writelane_b32 v1, s96, 57
+; MUBUF-NEXT:    v_writelane_b32 v1, s97, 58
+; MUBUF-NEXT:    v_writelane_b32 v1, s98, 59
+; MUBUF-NEXT:    v_writelane_b32 v1, s99, 60
+; MUBUF-NEXT:    v_writelane_b32 v1, s100, 61
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    v_writelane_b32 v1, s101, 62
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber v41
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    v_writelane_b32 v1, s102, 63
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_addk_i32 s32, 0x400
+; MUBUF-NEXT:    v_readlane_b32 s102, v1, 63
+; MUBUF-NEXT:    v_readlane_b32 s101, v1, 62
+; MUBUF-NEXT:    v_readlane_b32 s100, v1, 61
+; MUBUF-NEXT:    v_readlane_b32 s99, v1, 60
+; MUBUF-NEXT:    v_readlane_b32 s98, v1, 59
+; MUBUF-NEXT:    v_readlane_b32 s97, v1, 58
+; MUBUF-NEXT:    v_readlane_b32 s96, v1, 57
+; MUBUF-NEXT:    v_readlane_b32 s95, v1, 56
+; MUBUF-NEXT:    v_readlane_b32 s94, v1, 55
+; MUBUF-NEXT:    v_readlane_b32 s93, v1, 54
+; MUBUF-NEXT:    v_readlane_b32 s92, v1, 53
+; MUBUF-NEXT:    v_readlane_b32 s91, v1, 52
+; MUBUF-NEXT:    v_readlane_b32 s90, v1, 51
+; MUBUF-NEXT:    v_readlane_b32 s89, v1, 50
+; MUBUF-NEXT:    v_readlane_b32 s88, v1, 49
+; MUBUF-NEXT:    v_readlane_b32 s87, v1, 48
+; MUBUF-NEXT:    v_readlane_b32 s86, v1, 47
+; MUBUF-NEXT:    v_readlane_b32 s85, v1, 46
+; MUBUF-NEXT:    v_readlane_b32 s84, v1, 45
+; MUBUF-NEXT:    v_readlane_b32 s83, v1, 44
+; MUBUF-NEXT:    v_readlane_b32 s82, v1, 43
+; MUBUF-NEXT:    v_readlane_b32 s81, v1, 42
+; MUBUF-NEXT:    v_readlane_b32 s80, v1, 41
+; MUBUF-NEXT:    v_readlane_b32 s79, v1, 40
+; MUBUF-NEXT:    v_readlane_b32 s78, v1, 39
+; MUBUF-NEXT:    v_readlane_b32 s77, v1, 38
+; MUBUF-NEXT:    v_readlane_b32 s76, v1, 37
+; MUBUF-NEXT:    v_readlane_b32 s75, v1, 36
+; MUBUF-NEXT:    v_readlane_b32 s74, v1, 35
+; MUBUF-NEXT:    v_readlane_b32 s73, v1, 34
+; MUBUF-NEXT:    v_readlane_b32 s72, v1, 33
+; MUBUF-NEXT:    v_readlane_b32 s71, v1, 32
+; MUBUF-NEXT:    v_readlane_b32 s70, v1, 31
+; MUBUF-NEXT:    v_readlane_b32 s69, v1, 30
+; MUBUF-NEXT:    v_readlane_b32 s68, v1, 29
+; MUBUF-NEXT:    v_readlane_b32 s67, v1, 28
+; MUBUF-NEXT:    v_readlane_b32 s66, v1, 27
+; MUBUF-NEXT:    v_readlane_b32 s65, v1, 26
+; MUBUF-NEXT:    v_readlane_b32 s64, v1, 25
+; MUBUF-NEXT:    v_readlane_b32 s63, v1, 24
+; MUBUF-NEXT:    v_readlane_b32 s62, v1, 23
+; MUBUF-NEXT:    v_readlane_b32 s61, v1, 22
+; MUBUF-NEXT:    v_readlane_b32 s60, v1, 21
+; MUBUF-NEXT:    v_readlane_b32 s59, v1, 20
+; MUBUF-NEXT:    v_readlane_b32 s58, v1, 19
+; MUBUF-NEXT:    v_readlane_b32 s57, v1, 18
+; MUBUF-NEXT:    v_readlane_b32 s56, v1, 17
+; MUBUF-NEXT:    v_readlane_b32 s55, v1, 16
+; MUBUF-NEXT:    v_readlane_b32 s54, v1, 15
+; MUBUF-NEXT:    v_readlane_b32 s53, v1, 14
+; MUBUF-NEXT:    v_readlane_b32 s52, v1, 13
+; MUBUF-NEXT:    v_readlane_b32 s51, v1, 12
+; MUBUF-NEXT:    v_readlane_b32 s50, v1, 11
+; MUBUF-NEXT:    v_readlane_b32 s49, v1, 10
+; MUBUF-NEXT:    v_readlane_b32 s48, v1, 9
+; MUBUF-NEXT:    v_readlane_b32 s47, v1, 8
+; MUBUF-NEXT:    v_readlane_b32 s46, v1, 7
+; MUBUF-NEXT:    v_readlane_b32 s45, v1, 6
+; MUBUF-NEXT:    v_readlane_b32 s44, v1, 5
+; MUBUF-NEXT:    v_readlane_b32 s43, v1, 4
+; MUBUF-NEXT:    v_readlane_b32 s42, v1, 3
+; MUBUF-NEXT:    v_readlane_b32 s41, v1, 2
+; MUBUF-NEXT:    v_readlane_b32 s40, v1, 1
+; MUBUF-NEXT:    v_readlane_b32 s39, v1, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: no_new_vgpr_for_fp_csr:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v1, s33 offset:8 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    v_writelane_b32 v1, s39, 0
+; FLATSCR-NEXT:    v_writelane_b32 v1, s40, 1
+; FLATSCR-NEXT:    v_writelane_b32 v1, s41, 2
+; FLATSCR-NEXT:    v_writelane_b32 v1, s42, 3
+; FLATSCR-NEXT:    v_writelane_b32 v1, s43, 4
+; FLATSCR-NEXT:    v_writelane_b32 v1, s44, 5
+; FLATSCR-NEXT:    v_writelane_b32 v1, s45, 6
+; FLATSCR-NEXT:    v_writelane_b32 v1, s46, 7
+; FLATSCR-NEXT:    v_writelane_b32 v1, s47, 8
+; FLATSCR-NEXT:    v_writelane_b32 v1, s48, 9
+; FLATSCR-NEXT:    v_writelane_b32 v1, s49, 10
+; FLATSCR-NEXT:    v_writelane_b32 v1, s50, 11
+; FLATSCR-NEXT:    v_writelane_b32 v1, s51, 12
+; FLATSCR-NEXT:    v_writelane_b32 v1, s52, 13
+; FLATSCR-NEXT:    v_writelane_b32 v1, s53, 14
+; FLATSCR-NEXT:    v_writelane_b32 v1, s54, 15
+; FLATSCR-NEXT:    v_writelane_b32 v1, s55, 16
+; FLATSCR-NEXT:    v_writelane_b32 v1, s56, 17
+; FLATSCR-NEXT:    v_writelane_b32 v1, s57, 18
+; FLATSCR-NEXT:    v_writelane_b32 v1, s58, 19
+; FLATSCR-NEXT:    v_writelane_b32 v1, s59, 20
+; FLATSCR-NEXT:    v_writelane_b32 v1, s60, 21
+; FLATSCR-NEXT:    v_writelane_b32 v1, s61, 22
+; FLATSCR-NEXT:    v_writelane_b32 v1, s62, 23
+; FLATSCR-NEXT:    v_writelane_b32 v1, s63, 24
+; FLATSCR-NEXT:    v_writelane_b32 v1, s64, 25
+; FLATSCR-NEXT:    v_writelane_b32 v1, s65, 26
+; FLATSCR-NEXT:    v_writelane_b32 v1, s66, 27
+; FLATSCR-NEXT:    v_writelane_b32 v1, s67, 28
+; FLATSCR-NEXT:    v_writelane_b32 v1, s68, 29
+; FLATSCR-NEXT:    v_writelane_b32 v1, s69, 30
+; FLATSCR-NEXT:    v_writelane_b32 v1, s70, 31
+; FLATSCR-NEXT:    v_writelane_b32 v1, s71, 32
+; FLATSCR-NEXT:    v_writelane_b32 v1, s72, 33
+; FLATSCR-NEXT:    v_writelane_b32 v1, s73, 34
+; FLATSCR-NEXT:    v_writelane_b32 v1, s74, 35
+; FLATSCR-NEXT:    v_writelane_b32 v1, s75, 36
+; FLATSCR-NEXT:    v_writelane_b32 v1, s76, 37
+; FLATSCR-NEXT:    v_writelane_b32 v1, s77, 38
+; FLATSCR-NEXT:    v_writelane_b32 v1, s78, 39
+; FLATSCR-NEXT:    v_writelane_b32 v1, s79, 40
+; FLATSCR-NEXT:    v_writelane_b32 v1, s80, 41
+; FLATSCR-NEXT:    v_writelane_b32 v1, s81, 42
+; FLATSCR-NEXT:    v_writelane_b32 v1, s82, 43
+; FLATSCR-NEXT:    v_writelane_b32 v1, s83, 44
+; FLATSCR-NEXT:    v_writelane_b32 v1, s84, 45
+; FLATSCR-NEXT:    v_writelane_b32 v1, s85, 46
+; FLATSCR-NEXT:    v_writelane_b32 v1, s86, 47
+; FLATSCR-NEXT:    v_writelane_b32 v1, s87, 48
+; FLATSCR-NEXT:    v_writelane_b32 v1, s88, 49
+; FLATSCR-NEXT:    v_writelane_b32 v1, s89, 50
+; FLATSCR-NEXT:    v_writelane_b32 v1, s90, 51
+; FLATSCR-NEXT:    v_writelane_b32 v1, s91, 52
+; FLATSCR-NEXT:    v_writelane_b32 v1, s92, 53
+; FLATSCR-NEXT:    v_writelane_b32 v1, s93, 54
+; FLATSCR-NEXT:    v_writelane_b32 v1, s94, 55
+; FLATSCR-NEXT:    v_writelane_b32 v1, s95, 56
+; FLATSCR-NEXT:    v_writelane_b32 v1, s96, 57
+; FLATSCR-NEXT:    v_writelane_b32 v1, s97, 58
+; FLATSCR-NEXT:    v_writelane_b32 v1, s98, 59
+; FLATSCR-NEXT:    v_writelane_b32 v1, s99, 60
+; FLATSCR-NEXT:    v_writelane_b32 v1, s100, 61
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    v_writelane_b32 v1, s101, 62
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s33 offset:4
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber v41
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_writelane_b32 v1, s102, 63
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
+; FLATSCR-NEXT:    v_readlane_b32 s102, v1, 63
+; FLATSCR-NEXT:    v_readlane_b32 s101, v1, 62
+; FLATSCR-NEXT:    v_readlane_b32 s100, v1, 61
+; FLATSCR-NEXT:    v_readlane_b32 s99, v1, 60
+; FLATSCR-NEXT:    v_readlane_b32 s98, v1, 59
+; FLATSCR-NEXT:    v_readlane_b32 s97, v1, 58
+; FLATSCR-NEXT:    v_readlane_b32 s96, v1, 57
+; FLATSCR-NEXT:    v_readlane_b32 s95, v1, 56
+; FLATSCR-NEXT:    v_readlane_b32 s94, v1, 55
+; FLATSCR-NEXT:    v_readlane_b32 s93, v1, 54
+; FLATSCR-NEXT:    v_readlane_b32 s92, v1, 53
+; FLATSCR-NEXT:    v_readlane_b32 s91, v1, 52
+; FLATSCR-NEXT:    v_readlane_b32 s90, v1, 51
+; FLATSCR-NEXT:    v_readlane_b32 s89, v1, 50
+; FLATSCR-NEXT:    v_readlane_b32 s88, v1, 49
+; FLATSCR-NEXT:    v_readlane_b32 s87, v1, 48
+; FLATSCR-NEXT:    v_readlane_b32 s86, v1, 47
+; FLATSCR-NEXT:    v_readlane_b32 s85, v1, 46
+; FLATSCR-NEXT:    v_readlane_b32 s84, v1, 45
+; FLATSCR-NEXT:    v_readlane_b32 s83, v1, 44
+; FLATSCR-NEXT:    v_readlane_b32 s82, v1, 43
+; FLATSCR-NEXT:    v_readlane_b32 s81, v1, 42
+; FLATSCR-NEXT:    v_readlane_b32 s80, v1, 41
+; FLATSCR-NEXT:    v_readlane_b32 s79, v1, 40
+; FLATSCR-NEXT:    v_readlane_b32 s78, v1, 39
+; FLATSCR-NEXT:    v_readlane_b32 s77, v1, 38
+; FLATSCR-NEXT:    v_readlane_b32 s76, v1, 37
+; FLATSCR-NEXT:    v_readlane_b32 s75, v1, 36
+; FLATSCR-NEXT:    v_readlane_b32 s74, v1, 35
+; FLATSCR-NEXT:    v_readlane_b32 s73, v1, 34
+; FLATSCR-NEXT:    v_readlane_b32 s72, v1, 33
+; FLATSCR-NEXT:    v_readlane_b32 s71, v1, 32
+; FLATSCR-NEXT:    v_readlane_b32 s70, v1, 31
+; FLATSCR-NEXT:    v_readlane_b32 s69, v1, 30
+; FLATSCR-NEXT:    v_readlane_b32 s68, v1, 29
+; FLATSCR-NEXT:    v_readlane_b32 s67, v1, 28
+; FLATSCR-NEXT:    v_readlane_b32 s66, v1, 27
+; FLATSCR-NEXT:    v_readlane_b32 s65, v1, 26
+; FLATSCR-NEXT:    v_readlane_b32 s64, v1, 25
+; FLATSCR-NEXT:    v_readlane_b32 s63, v1, 24
+; FLATSCR-NEXT:    v_readlane_b32 s62, v1, 23
+; FLATSCR-NEXT:    v_readlane_b32 s61, v1, 22
+; FLATSCR-NEXT:    v_readlane_b32 s60, v1, 21
+; FLATSCR-NEXT:    v_readlane_b32 s59, v1, 20
+; FLATSCR-NEXT:    v_readlane_b32 s58, v1, 19
+; FLATSCR-NEXT:    v_readlane_b32 s57, v1, 18
+; FLATSCR-NEXT:    v_readlane_b32 s56, v1, 17
+; FLATSCR-NEXT:    v_readlane_b32 s55, v1, 16
+; FLATSCR-NEXT:    v_readlane_b32 s54, v1, 15
+; FLATSCR-NEXT:    v_readlane_b32 s53, v1, 14
+; FLATSCR-NEXT:    v_readlane_b32 s52, v1, 13
+; FLATSCR-NEXT:    v_readlane_b32 s51, v1, 12
+; FLATSCR-NEXT:    v_readlane_b32 s50, v1, 11
+; FLATSCR-NEXT:    v_readlane_b32 s49, v1, 10
+; FLATSCR-NEXT:    v_readlane_b32 s48, v1, 9
+; FLATSCR-NEXT:    v_readlane_b32 s47, v1, 8
+; FLATSCR-NEXT:    v_readlane_b32 s46, v1, 7
+; FLATSCR-NEXT:    v_readlane_b32 s45, v1, 6
+; FLATSCR-NEXT:    v_readlane_b32 s44, v1, 5
+; FLATSCR-NEXT:    v_readlane_b32 s43, v1, 4
+; FLATSCR-NEXT:    v_readlane_b32 s42, v1, 3
+; FLATSCR-NEXT:    v_readlane_b32 s41, v1, 2
+; FLATSCR-NEXT:    v_readlane_b32 s40, v1, 1
+; FLATSCR-NEXT:    v_readlane_b32 s39, v1, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:8 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   call void asm sideeffect "; clobber v41", "~{v41}"()
@@ -356,64 +1301,99 @@ define void @no_new_vgpr_for_fp_csr() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}realign_stack_no_fp_elim:
-; GCN: s_waitcnt
-; MUBUF-NEXT:   s_mov_b32 [[FP_COPY:s4]], s33
-; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
-; MUBUF-NEXT:   s_add_i32 s33, s32, 0x7ffc0
-; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff
-; MUBUF-NEXT:   s_and_b32 s33, s33, 0xfff80000
-; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000
-; MUBUF-NEXT:   s_mov_b32 s5, s34
-; FLATSCR-NEXT: s_mov_b32 s1, s34
-; MUBUF-NEXT:   s_mov_b32 s34, s32 
-; FLATSCR-NEXT: s_mov_b32 s34, s32
-; MUBUF-NEXT:   s_add_i32 s32, s32, 0x180000
-; FLATSCR-NEXT: s_addk_i32 s32, 0x6000
-; GCN-NEXT:     v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; MUBUF-NEXT:   v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}}
-; FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000
-; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s2
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT:   s_mov_b32 s32, s34 
-; MUBUF-NEXT:   s_mov_b32 s34, s5 
-; FLATSCR-NEXT:   s_mov_b32 s32, s34 
-; FLATSCR-NEXT:   s_mov_b32 s34, s1 
-; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
-; GCN-NEXT: s_setpc_b64
 define void @realign_stack_no_fp_elim() #1 {
+; MUBUF-LABEL: realign_stack_no_fp_elim:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_add_i32 s33, s32, 0x7ffc0
+; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfff80000
+; MUBUF-NEXT:    s_mov_b32 s5, s34
+; MUBUF-NEXT:    s_mov_b32 s34, s32
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0x180000
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x2000
+; MUBUF-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s32, s34
+; MUBUF-NEXT:    s_mov_b32 s34, s5
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: realign_stack_no_fp_elim:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_add_i32 s33, s32, 0x1fff
+; FLATSCR-NEXT:    s_and_b32 s33, s33, 0xffffe000
+; FLATSCR-NEXT:    s_mov_b32 s1, s34
+; FLATSCR-NEXT:    s_mov_b32 s34, s32
+; FLATSCR-NEXT:    s_addk_i32 s32, 0x6000
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    s_add_i32 s2, s33, 0x2000
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s2
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s32, s34
+; FLATSCR-NEXT:    s_mov_b32 s34, s1
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, align 8192, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   ret void
 }
 
-; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
-; GCN: s_waitcnt
-; GCN-NEXT: s_mov_b32 vcc_lo, s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0
-; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; GCN: v_writelane_b32 [[CSR_VGPR]], s31, 1
-; MUBUF:   buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}}
-; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}}
-; GCN-NEXT:     s_waitcnt vmcnt(0)
-; GCN: ;;#ASMSTART
-; GCN:     v_readlane_b32 s31, [[CSR_VGPR]], 1
-; GCN:     v_readlane_b32 s30, [[CSR_VGPR]], 0
-;GCN-NEXT: s_mov_b32 s32, s33
-; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_mov_b32 s33, vcc_lo
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
 define void @no_unused_non_csr_sgpr_for_fp() #1 {
+; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 vcc_lo, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    v_writelane_b32 v1, s30, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
+; MUBUF-NEXT:    v_writelane_b32 v1, s31, 1
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
+; MUBUF-NEXT:    v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    s_mov_b32 s33, vcc_lo
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 vcc_lo, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    v_writelane_b32 v1, s30, 0
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
+; FLATSCR-NEXT:    v_writelane_b32 v1, s31, 1
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
+; FLATSCR-NEXT:    v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    s_mov_b32 s33, vcc_lo
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
 
@@ -428,31 +1408,64 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 }
 
 ; Need a new CSR VGPR to satisfy the FP spill.
-; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
-; GCN: s_waitcnt
-; GCN-NEXT: s_mov_b32 vcc_lo, s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN: v_mov_b32_e32
-
-; MUBUF:       s_addk_i32 s32, 0x300{{$}}
-; FLATSCR:     s_add_i32 s32, s32, 12{{$}}
-; MUBUF-DAG:   buffer_store_dword
-; FLATSCR-DAG: scratch_store_dword
-
-; GCN: ;;#ASMSTART
-; GCN: s_mov_b32 s32, s33
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_mov_b32 s33, vcc_lo
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
 define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
+; MUBUF-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 vcc_lo, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber nonpreserved initial VGPRs
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    s_mov_b32 s33, vcc_lo
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 vcc_lo, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber nonpreserved initial VGPRs
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    s_mov_b32 s33, vcc_lo
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
 
@@ -474,32 +1487,72 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 
 ; The byval argument exceeds the MUBUF constant offset, so a scratch
 ; register is needed to access the CSR VGPR slot.
-; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset:
-; GCN: s_waitcnt
-; GCN-NEXT: s_mov_b32 vcc_lo, s33
-; GCN-DAG:  s_mov_b32 s33, s32
-; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100
-; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004
-; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
-; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; MUBUF-DAG:   s_add_i32 s32, s32, 0x40300{{$}}
-; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}}
-; MUBUF-DAG:   buffer_store_dword
-; FLATSCR-DAG: scratch_store_dword
-
-; GCN: ;;#ASMSTART
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100
-; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload
-; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004
-; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_mov_b32 s33, vcc_lo
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
 define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #1 {
+; MUBUF-LABEL: scratch_reg_needed_mubuf_offset:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 vcc_lo, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    s_add_i32 s6, s33, 0x40100
+; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s6 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x1000
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40300
+; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber nonpreserved SGPRs
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber nonpreserved VGPRs
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    s_add_i32 s6, s33, 0x40100
+; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s6 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    s_mov_b32 s33, vcc_lo
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: scratch_reg_needed_mubuf_offset:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 vcc_lo, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    s_add_i32 s2, s33, 0x1004
+; FLATSCR-NEXT:    scratch_store_dword off, v40, s2 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    s_addk_i32 s32, 0x100c
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    s_add_i32 s0, s33, 0x1000
+; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber nonpreserved VGPRs
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    s_add_i32 s2, s33, 0x1004
+; FLATSCR-NEXT:    scratch_load_dword v40, off, s2 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    s_mov_b32 s33, vcc_lo
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
 
@@ -520,25 +1573,72 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8])
   ret void
 }
 
-; GCN-LABEL: {{^}}local_empty_func:
-; GCN: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define internal void @local_empty_func() #0 {
+; GCN-LABEL: local_empty_func:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   ret void
 }
 
 ; An FP is needed, despite not needing any spills
 ; TODO: Could see callee does not use stack and omit FP.
-; GCN-LABEL: {{^}}ipra_call_with_stack:
-; GCN: s_mov_b32 [[TMP_SGPR:s[0-9]+]], s33
-; GCN: s_mov_b32 s33, s32
-; MUBUF:   s_addk_i32 s32, 0x400
-; FLATSCR: s_add_i32 s32, s32, 16
-; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}}
-; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}}
-; GCN:     s_swappc_b64
-; GCN: s_mov_b32 s33, [[TMP_SGPR]]
 define void @ipra_call_with_stack() #0 {
+; MUBUF-LABEL: ipra_call_with_stack:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s18, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[16:17]
+; MUBUF-NEXT:    s_addk_i32 s32, 0x400
+; MUBUF-NEXT:    v_writelane_b32 v1, s30, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    v_writelane_b32 v1, s31, 1
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_getpc_b64 s[16:17]
+; MUBUF-NEXT:    s_add_u32 s16, s16, local_empty_func@rel32@lo+4
+; MUBUF-NEXT:    s_addc_u32 s17, s17, local_empty_func@rel32@hi+12
+; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; MUBUF-NEXT:    v_readlane_b32 s31, v1, 1
+; MUBUF-NEXT:    v_readlane_b32 s30, v1, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT:    s_mov_b32 s33, s18
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: ipra_call_with_stack:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s2, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v1, s33 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
+; FLATSCR-NEXT:    v_writelane_b32 v1, s30, 0
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    v_writelane_b32 v1, s31, 1
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s33
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
+; FLATSCR-NEXT:    s_add_u32 s0, s0, local_empty_func@rel32@lo+4
+; FLATSCR-NEXT:    s_addc_u32 s1, s1, local_empty_func@rel32@hi+12
+; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT:    v_readlane_b32 s31, v1, 1
+; FLATSCR-NEXT:    v_readlane_b32 s30, v1, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT:    scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT:    s_mov_b32 s33, s2
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
   call void @local_empty_func()
@@ -546,21 +1646,41 @@ define void @ipra_call_with_stack() #0 {
 }
 
 ; With no free registers, we must spill the FP to memory.
-; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
-; MUBUF:   s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; FLATSCR: s_mov_b32 s0, s33
-; GCN:     s_mov_b32 s33, s32
-; MUBUF:   v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]]
-; MUBUF:   buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s33 ; 4-byte Folded Spill
-; MUBUF:   buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Reload
-; MUBUF:   s_waitcnt vmcnt(0)
-; MUBUF:   v_readfirstlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[TMP_VGPR2]]
-; MUBUF:   s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; FLATSCR: s_mov_b32 s33, s0
-; GCN:     s_setpc_b64
-; MUBUF:   ScratchSize: 8
-; FLATSCR: ScratchSize: 0
 define void @callee_need_to_spill_fp_to_memory() #3 {
+; MUBUF-LABEL: callee_need_to_spill_fp_to_memory:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    v_mov_b32_e32 v0, s4
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber nonpreserved SGPRs
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber all VGPRs
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_addk_i32 s32, 0x200
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber all VGPRs
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber nonpreserved SGPRs",
     "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
     ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
@@ -578,23 +1698,313 @@ define void @callee_need_to_spill_fp_to_memory() #3 {
 ; If we have a reserved VGPR that can be used for SGPR spills, we may still
 ; need to spill the FP to memory if there are no free lanes in the reserved
 ; VGPR.
-; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
-; MUBUF:   s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN:     s_mov_b32 s33, s32
-; MUBUF:   s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF:   s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF:   v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]]
-; MUBUF:   buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s33 offset:[[OFF:[0-9]+]]
-; GCN-NOT: v_writelane_b32 v40, s33
-; GCN-NOT: v_readlane_b32 s33, v40
-; GCN-NOT: v_readlane_b32 s33, v40
-; MUBUF:   buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s33 offset:[[OFF]]
-; MUBUF:   v_readfirstlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[TMP_VGPR2]]
-; MUBUF:   s_xor_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; MUBUF:   s_mov_b64 exec, [[COPY_EXEC2]]
-; MUBUF:   s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN:     s_setpc_b64
 define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
+; MUBUF-LABEL: callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_store_dword v39, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    v_writelane_b32 v39, s39, 0
+; MUBUF-NEXT:    v_writelane_b32 v39, s40, 1
+; MUBUF-NEXT:    v_writelane_b32 v39, s41, 2
+; MUBUF-NEXT:    v_writelane_b32 v39, s42, 3
+; MUBUF-NEXT:    v_writelane_b32 v39, s43, 4
+; MUBUF-NEXT:    v_writelane_b32 v39, s44, 5
+; MUBUF-NEXT:    v_writelane_b32 v39, s45, 6
+; MUBUF-NEXT:    v_writelane_b32 v39, s46, 7
+; MUBUF-NEXT:    v_writelane_b32 v39, s47, 8
+; MUBUF-NEXT:    v_writelane_b32 v39, s48, 9
+; MUBUF-NEXT:    v_writelane_b32 v39, s49, 10
+; MUBUF-NEXT:    v_writelane_b32 v39, s50, 11
+; MUBUF-NEXT:    v_writelane_b32 v39, s51, 12
+; MUBUF-NEXT:    v_writelane_b32 v39, s52, 13
+; MUBUF-NEXT:    v_writelane_b32 v39, s53, 14
+; MUBUF-NEXT:    v_writelane_b32 v39, s54, 15
+; MUBUF-NEXT:    v_writelane_b32 v39, s55, 16
+; MUBUF-NEXT:    v_writelane_b32 v39, s56, 17
+; MUBUF-NEXT:    v_writelane_b32 v39, s57, 18
+; MUBUF-NEXT:    v_writelane_b32 v39, s58, 19
+; MUBUF-NEXT:    v_writelane_b32 v39, s59, 20
+; MUBUF-NEXT:    v_writelane_b32 v39, s60, 21
+; MUBUF-NEXT:    v_writelane_b32 v39, s61, 22
+; MUBUF-NEXT:    v_writelane_b32 v39, s62, 23
+; MUBUF-NEXT:    v_writelane_b32 v39, s63, 24
+; MUBUF-NEXT:    v_writelane_b32 v39, s64, 25
+; MUBUF-NEXT:    v_writelane_b32 v39, s65, 26
+; MUBUF-NEXT:    v_writelane_b32 v39, s66, 27
+; MUBUF-NEXT:    v_writelane_b32 v39, s67, 28
+; MUBUF-NEXT:    v_writelane_b32 v39, s68, 29
+; MUBUF-NEXT:    v_writelane_b32 v39, s69, 30
+; MUBUF-NEXT:    v_writelane_b32 v39, s70, 31
+; MUBUF-NEXT:    v_writelane_b32 v39, s71, 32
+; MUBUF-NEXT:    v_writelane_b32 v39, s72, 33
+; MUBUF-NEXT:    v_writelane_b32 v39, s73, 34
+; MUBUF-NEXT:    v_writelane_b32 v39, s74, 35
+; MUBUF-NEXT:    v_writelane_b32 v39, s75, 36
+; MUBUF-NEXT:    v_writelane_b32 v39, s76, 37
+; MUBUF-NEXT:    v_writelane_b32 v39, s77, 38
+; MUBUF-NEXT:    v_writelane_b32 v39, s78, 39
+; MUBUF-NEXT:    v_writelane_b32 v39, s79, 40
+; MUBUF-NEXT:    v_writelane_b32 v39, s80, 41
+; MUBUF-NEXT:    v_writelane_b32 v39, s81, 42
+; MUBUF-NEXT:    v_writelane_b32 v39, s82, 43
+; MUBUF-NEXT:    v_writelane_b32 v39, s83, 44
+; MUBUF-NEXT:    v_writelane_b32 v39, s84, 45
+; MUBUF-NEXT:    v_writelane_b32 v39, s85, 46
+; MUBUF-NEXT:    v_writelane_b32 v39, s86, 47
+; MUBUF-NEXT:    v_writelane_b32 v39, s87, 48
+; MUBUF-NEXT:    v_writelane_b32 v39, s88, 49
+; MUBUF-NEXT:    v_writelane_b32 v39, s89, 50
+; MUBUF-NEXT:    v_writelane_b32 v39, s90, 51
+; MUBUF-NEXT:    v_writelane_b32 v39, s91, 52
+; MUBUF-NEXT:    v_writelane_b32 v39, s92, 53
+; MUBUF-NEXT:    v_writelane_b32 v39, s93, 54
+; MUBUF-NEXT:    v_writelane_b32 v39, s94, 55
+; MUBUF-NEXT:    v_writelane_b32 v39, s95, 56
+; MUBUF-NEXT:    v_writelane_b32 v39, s96, 57
+; MUBUF-NEXT:    v_writelane_b32 v39, s97, 58
+; MUBUF-NEXT:    v_writelane_b32 v39, s98, 59
+; MUBUF-NEXT:    v_writelane_b32 v39, s99, 60
+; MUBUF-NEXT:    v_writelane_b32 v39, s100, 61
+; MUBUF-NEXT:    v_mov_b32_e32 v0, s4
+; MUBUF-NEXT:    v_writelane_b32 v39, s101, 62
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT:    v_writelane_b32 v39, s102, 63
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber all VGPRs except CSR v40
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
+; MUBUF-NEXT:    v_readlane_b32 s102, v39, 63
+; MUBUF-NEXT:    v_readlane_b32 s101, v39, 62
+; MUBUF-NEXT:    v_readlane_b32 s100, v39, 61
+; MUBUF-NEXT:    v_readlane_b32 s99, v39, 60
+; MUBUF-NEXT:    v_readlane_b32 s98, v39, 59
+; MUBUF-NEXT:    v_readlane_b32 s97, v39, 58
+; MUBUF-NEXT:    v_readlane_b32 s96, v39, 57
+; MUBUF-NEXT:    v_readlane_b32 s95, v39, 56
+; MUBUF-NEXT:    v_readlane_b32 s94, v39, 55
+; MUBUF-NEXT:    v_readlane_b32 s93, v39, 54
+; MUBUF-NEXT:    v_readlane_b32 s92, v39, 53
+; MUBUF-NEXT:    v_readlane_b32 s91, v39, 52
+; MUBUF-NEXT:    v_readlane_b32 s90, v39, 51
+; MUBUF-NEXT:    v_readlane_b32 s89, v39, 50
+; MUBUF-NEXT:    v_readlane_b32 s88, v39, 49
+; MUBUF-NEXT:    v_readlane_b32 s87, v39, 48
+; MUBUF-NEXT:    v_readlane_b32 s86, v39, 47
+; MUBUF-NEXT:    v_readlane_b32 s85, v39, 46
+; MUBUF-NEXT:    v_readlane_b32 s84, v39, 45
+; MUBUF-NEXT:    v_readlane_b32 s83, v39, 44
+; MUBUF-NEXT:    v_readlane_b32 s82, v39, 43
+; MUBUF-NEXT:    v_readlane_b32 s81, v39, 42
+; MUBUF-NEXT:    v_readlane_b32 s80, v39, 41
+; MUBUF-NEXT:    v_readlane_b32 s79, v39, 40
+; MUBUF-NEXT:    v_readlane_b32 s78, v39, 39
+; MUBUF-NEXT:    v_readlane_b32 s77, v39, 38
+; MUBUF-NEXT:    v_readlane_b32 s76, v39, 37
+; MUBUF-NEXT:    v_readlane_b32 s75, v39, 36
+; MUBUF-NEXT:    v_readlane_b32 s74, v39, 35
+; MUBUF-NEXT:    v_readlane_b32 s73, v39, 34
+; MUBUF-NEXT:    v_readlane_b32 s72, v39, 33
+; MUBUF-NEXT:    v_readlane_b32 s71, v39, 32
+; MUBUF-NEXT:    v_readlane_b32 s70, v39, 31
+; MUBUF-NEXT:    v_readlane_b32 s69, v39, 30
+; MUBUF-NEXT:    v_readlane_b32 s68, v39, 29
+; MUBUF-NEXT:    v_readlane_b32 s67, v39, 28
+; MUBUF-NEXT:    v_readlane_b32 s66, v39, 27
+; MUBUF-NEXT:    v_readlane_b32 s65, v39, 26
+; MUBUF-NEXT:    v_readlane_b32 s64, v39, 25
+; MUBUF-NEXT:    v_readlane_b32 s63, v39, 24
+; MUBUF-NEXT:    v_readlane_b32 s62, v39, 23
+; MUBUF-NEXT:    v_readlane_b32 s61, v39, 22
+; MUBUF-NEXT:    v_readlane_b32 s60, v39, 21
+; MUBUF-NEXT:    v_readlane_b32 s59, v39, 20
+; MUBUF-NEXT:    v_readlane_b32 s58, v39, 19
+; MUBUF-NEXT:    v_readlane_b32 s57, v39, 18
+; MUBUF-NEXT:    v_readlane_b32 s56, v39, 17
+; MUBUF-NEXT:    v_readlane_b32 s55, v39, 16
+; MUBUF-NEXT:    v_readlane_b32 s54, v39, 15
+; MUBUF-NEXT:    v_readlane_b32 s53, v39, 14
+; MUBUF-NEXT:    v_readlane_b32 s52, v39, 13
+; MUBUF-NEXT:    v_readlane_b32 s51, v39, 12
+; MUBUF-NEXT:    v_readlane_b32 s50, v39, 11
+; MUBUF-NEXT:    v_readlane_b32 s49, v39, 10
+; MUBUF-NEXT:    v_readlane_b32 s48, v39, 9
+; MUBUF-NEXT:    v_readlane_b32 s47, v39, 8
+; MUBUF-NEXT:    v_readlane_b32 s46, v39, 7
+; MUBUF-NEXT:    v_readlane_b32 s45, v39, 6
+; MUBUF-NEXT:    v_readlane_b32 s44, v39, 5
+; MUBUF-NEXT:    v_readlane_b32 s43, v39, 4
+; MUBUF-NEXT:    v_readlane_b32 s42, v39, 3
+; MUBUF-NEXT:    v_readlane_b32 s41, v39, 2
+; MUBUF-NEXT:    v_readlane_b32 s40, v39, 1
+; MUBUF-NEXT:    v_readlane_b32 s39, v39, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_load_dword v39, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v39, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    v_writelane_b32 v39, s39, 0
+; FLATSCR-NEXT:    v_writelane_b32 v39, s40, 1
+; FLATSCR-NEXT:    v_writelane_b32 v39, s41, 2
+; FLATSCR-NEXT:    v_writelane_b32 v39, s42, 3
+; FLATSCR-NEXT:    v_writelane_b32 v39, s43, 4
+; FLATSCR-NEXT:    v_writelane_b32 v39, s44, 5
+; FLATSCR-NEXT:    v_writelane_b32 v39, s45, 6
+; FLATSCR-NEXT:    v_writelane_b32 v39, s46, 7
+; FLATSCR-NEXT:    v_writelane_b32 v39, s47, 8
+; FLATSCR-NEXT:    v_writelane_b32 v39, s48, 9
+; FLATSCR-NEXT:    v_writelane_b32 v39, s49, 10
+; FLATSCR-NEXT:    v_writelane_b32 v39, s50, 11
+; FLATSCR-NEXT:    v_writelane_b32 v39, s51, 12
+; FLATSCR-NEXT:    v_writelane_b32 v39, s52, 13
+; FLATSCR-NEXT:    v_writelane_b32 v39, s53, 14
+; FLATSCR-NEXT:    v_writelane_b32 v39, s54, 15
+; FLATSCR-NEXT:    v_writelane_b32 v39, s55, 16
+; FLATSCR-NEXT:    v_writelane_b32 v39, s56, 17
+; FLATSCR-NEXT:    v_writelane_b32 v39, s57, 18
+; FLATSCR-NEXT:    v_writelane_b32 v39, s58, 19
+; FLATSCR-NEXT:    v_writelane_b32 v39, s59, 20
+; FLATSCR-NEXT:    v_writelane_b32 v39, s60, 21
+; FLATSCR-NEXT:    v_writelane_b32 v39, s61, 22
+; FLATSCR-NEXT:    v_writelane_b32 v39, s62, 23
+; FLATSCR-NEXT:    v_writelane_b32 v39, s63, 24
+; FLATSCR-NEXT:    v_writelane_b32 v39, s64, 25
+; FLATSCR-NEXT:    v_writelane_b32 v39, s65, 26
+; FLATSCR-NEXT:    v_writelane_b32 v39, s66, 27
+; FLATSCR-NEXT:    v_writelane_b32 v39, s67, 28
+; FLATSCR-NEXT:    v_writelane_b32 v39, s68, 29
+; FLATSCR-NEXT:    v_writelane_b32 v39, s69, 30
+; FLATSCR-NEXT:    v_writelane_b32 v39, s70, 31
+; FLATSCR-NEXT:    v_writelane_b32 v39, s71, 32
+; FLATSCR-NEXT:    v_writelane_b32 v39, s72, 33
+; FLATSCR-NEXT:    v_writelane_b32 v39, s73, 34
+; FLATSCR-NEXT:    v_writelane_b32 v39, s74, 35
+; FLATSCR-NEXT:    v_writelane_b32 v39, s75, 36
+; FLATSCR-NEXT:    v_writelane_b32 v39, s76, 37
+; FLATSCR-NEXT:    v_writelane_b32 v39, s77, 38
+; FLATSCR-NEXT:    v_writelane_b32 v39, s78, 39
+; FLATSCR-NEXT:    v_writelane_b32 v39, s79, 40
+; FLATSCR-NEXT:    v_writelane_b32 v39, s80, 41
+; FLATSCR-NEXT:    v_writelane_b32 v39, s81, 42
+; FLATSCR-NEXT:    v_writelane_b32 v39, s82, 43
+; FLATSCR-NEXT:    v_writelane_b32 v39, s83, 44
+; FLATSCR-NEXT:    v_writelane_b32 v39, s84, 45
+; FLATSCR-NEXT:    v_writelane_b32 v39, s85, 46
+; FLATSCR-NEXT:    v_writelane_b32 v39, s86, 47
+; FLATSCR-NEXT:    v_writelane_b32 v39, s87, 48
+; FLATSCR-NEXT:    v_writelane_b32 v39, s88, 49
+; FLATSCR-NEXT:    v_writelane_b32 v39, s89, 50
+; FLATSCR-NEXT:    v_writelane_b32 v39, s90, 51
+; FLATSCR-NEXT:    v_writelane_b32 v39, s91, 52
+; FLATSCR-NEXT:    v_writelane_b32 v39, s92, 53
+; FLATSCR-NEXT:    v_writelane_b32 v39, s93, 54
+; FLATSCR-NEXT:    v_writelane_b32 v39, s94, 55
+; FLATSCR-NEXT:    v_writelane_b32 v39, s95, 56
+; FLATSCR-NEXT:    v_writelane_b32 v39, s96, 57
+; FLATSCR-NEXT:    v_writelane_b32 v39, s97, 58
+; FLATSCR-NEXT:    v_writelane_b32 v39, s98, 59
+; FLATSCR-NEXT:    v_writelane_b32 v39, s99, 60
+; FLATSCR-NEXT:    v_writelane_b32 v39, s100, 61
+; FLATSCR-NEXT:    v_writelane_b32 v39, s101, 62
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 8
+; FLATSCR-NEXT:    v_writelane_b32 v39, s102, 63
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber all VGPRs except CSR v40
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s102, v39, 63
+; FLATSCR-NEXT:    v_readlane_b32 s101, v39, 62
+; FLATSCR-NEXT:    v_readlane_b32 s100, v39, 61
+; FLATSCR-NEXT:    v_readlane_b32 s99, v39, 60
+; FLATSCR-NEXT:    v_readlane_b32 s98, v39, 59
+; FLATSCR-NEXT:    v_readlane_b32 s97, v39, 58
+; FLATSCR-NEXT:    v_readlane_b32 s96, v39, 57
+; FLATSCR-NEXT:    v_readlane_b32 s95, v39, 56
+; FLATSCR-NEXT:    v_readlane_b32 s94, v39, 55
+; FLATSCR-NEXT:    v_readlane_b32 s93, v39, 54
+; FLATSCR-NEXT:    v_readlane_b32 s92, v39, 53
+; FLATSCR-NEXT:    v_readlane_b32 s91, v39, 52
+; FLATSCR-NEXT:    v_readlane_b32 s90, v39, 51
+; FLATSCR-NEXT:    v_readlane_b32 s89, v39, 50
+; FLATSCR-NEXT:    v_readlane_b32 s88, v39, 49
+; FLATSCR-NEXT:    v_readlane_b32 s87, v39, 48
+; FLATSCR-NEXT:    v_readlane_b32 s86, v39, 47
+; FLATSCR-NEXT:    v_readlane_b32 s85, v39, 46
+; FLATSCR-NEXT:    v_readlane_b32 s84, v39, 45
+; FLATSCR-NEXT:    v_readlane_b32 s83, v39, 44
+; FLATSCR-NEXT:    v_readlane_b32 s82, v39, 43
+; FLATSCR-NEXT:    v_readlane_b32 s81, v39, 42
+; FLATSCR-NEXT:    v_readlane_b32 s80, v39, 41
+; FLATSCR-NEXT:    v_readlane_b32 s79, v39, 40
+; FLATSCR-NEXT:    v_readlane_b32 s78, v39, 39
+; FLATSCR-NEXT:    v_readlane_b32 s77, v39, 38
+; FLATSCR-NEXT:    v_readlane_b32 s76, v39, 37
+; FLATSCR-NEXT:    v_readlane_b32 s75, v39, 36
+; FLATSCR-NEXT:    v_readlane_b32 s74, v39, 35
+; FLATSCR-NEXT:    v_readlane_b32 s73, v39, 34
+; FLATSCR-NEXT:    v_readlane_b32 s72, v39, 33
+; FLATSCR-NEXT:    v_readlane_b32 s71, v39, 32
+; FLATSCR-NEXT:    v_readlane_b32 s70, v39, 31
+; FLATSCR-NEXT:    v_readlane_b32 s69, v39, 30
+; FLATSCR-NEXT:    v_readlane_b32 s68, v39, 29
+; FLATSCR-NEXT:    v_readlane_b32 s67, v39, 28
+; FLATSCR-NEXT:    v_readlane_b32 s66, v39, 27
+; FLATSCR-NEXT:    v_readlane_b32 s65, v39, 26
+; FLATSCR-NEXT:    v_readlane_b32 s64, v39, 25
+; FLATSCR-NEXT:    v_readlane_b32 s63, v39, 24
+; FLATSCR-NEXT:    v_readlane_b32 s62, v39, 23
+; FLATSCR-NEXT:    v_readlane_b32 s61, v39, 22
+; FLATSCR-NEXT:    v_readlane_b32 s60, v39, 21
+; FLATSCR-NEXT:    v_readlane_b32 s59, v39, 20
+; FLATSCR-NEXT:    v_readlane_b32 s58, v39, 19
+; FLATSCR-NEXT:    v_readlane_b32 s57, v39, 18
+; FLATSCR-NEXT:    v_readlane_b32 s56, v39, 17
+; FLATSCR-NEXT:    v_readlane_b32 s55, v39, 16
+; FLATSCR-NEXT:    v_readlane_b32 s54, v39, 15
+; FLATSCR-NEXT:    v_readlane_b32 s53, v39, 14
+; FLATSCR-NEXT:    v_readlane_b32 s52, v39, 13
+; FLATSCR-NEXT:    v_readlane_b32 s51, v39, 12
+; FLATSCR-NEXT:    v_readlane_b32 s50, v39, 11
+; FLATSCR-NEXT:    v_readlane_b32 s49, v39, 10
+; FLATSCR-NEXT:    v_readlane_b32 s48, v39, 9
+; FLATSCR-NEXT:    v_readlane_b32 s47, v39, 8
+; FLATSCR-NEXT:    v_readlane_b32 s46, v39, 7
+; FLATSCR-NEXT:    v_readlane_b32 s45, v39, 6
+; FLATSCR-NEXT:    v_readlane_b32 s44, v39, 5
+; FLATSCR-NEXT:    v_readlane_b32 s43, v39, 4
+; FLATSCR-NEXT:    v_readlane_b32 s42, v39, 3
+; FLATSCR-NEXT:    v_readlane_b32 s41, v39, 2
+; FLATSCR-NEXT:    v_readlane_b32 s40, v39, 1
+; FLATSCR-NEXT:    v_readlane_b32 s39, v39, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_load_dword v39, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
     "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
     ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
@@ -619,17 +2029,312 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
 ; the exec register is saved to s0 when saving CSR in the function prolog.
 ; Make sure that the FP save happens after restoring exec from the same
 ; register.
-; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg:
-; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; FLATSCR: s_mov_b32 s33, s32
-; GCN-NOT: v_writelane_b32 v40, s33
-; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]]
-; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NOT: v_readlane_b32 s33, v40
-; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN:     s_setpc_b64
 define void @callee_need_to_spill_fp_to_reg() #1 {
+; MUBUF-LABEL: callee_need_to_spill_fp_to_reg:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    v_writelane_b32 v40, s39, 0
+; MUBUF-NEXT:    v_writelane_b32 v40, s40, 1
+; MUBUF-NEXT:    v_writelane_b32 v40, s41, 2
+; MUBUF-NEXT:    v_writelane_b32 v40, s42, 3
+; MUBUF-NEXT:    v_writelane_b32 v40, s43, 4
+; MUBUF-NEXT:    v_writelane_b32 v40, s44, 5
+; MUBUF-NEXT:    v_writelane_b32 v40, s45, 6
+; MUBUF-NEXT:    v_writelane_b32 v40, s46, 7
+; MUBUF-NEXT:    v_writelane_b32 v40, s47, 8
+; MUBUF-NEXT:    v_writelane_b32 v40, s48, 9
+; MUBUF-NEXT:    v_writelane_b32 v40, s49, 10
+; MUBUF-NEXT:    v_writelane_b32 v40, s50, 11
+; MUBUF-NEXT:    v_writelane_b32 v40, s51, 12
+; MUBUF-NEXT:    v_writelane_b32 v40, s52, 13
+; MUBUF-NEXT:    v_writelane_b32 v40, s53, 14
+; MUBUF-NEXT:    v_writelane_b32 v40, s54, 15
+; MUBUF-NEXT:    v_writelane_b32 v40, s55, 16
+; MUBUF-NEXT:    v_writelane_b32 v40, s56, 17
+; MUBUF-NEXT:    v_writelane_b32 v40, s57, 18
+; MUBUF-NEXT:    v_writelane_b32 v40, s58, 19
+; MUBUF-NEXT:    v_writelane_b32 v40, s59, 20
+; MUBUF-NEXT:    v_writelane_b32 v40, s60, 21
+; MUBUF-NEXT:    v_writelane_b32 v40, s61, 22
+; MUBUF-NEXT:    v_writelane_b32 v40, s62, 23
+; MUBUF-NEXT:    v_writelane_b32 v40, s63, 24
+; MUBUF-NEXT:    v_writelane_b32 v40, s64, 25
+; MUBUF-NEXT:    v_writelane_b32 v40, s65, 26
+; MUBUF-NEXT:    v_writelane_b32 v40, s66, 27
+; MUBUF-NEXT:    v_writelane_b32 v40, s67, 28
+; MUBUF-NEXT:    v_writelane_b32 v40, s68, 29
+; MUBUF-NEXT:    v_writelane_b32 v40, s69, 30
+; MUBUF-NEXT:    v_writelane_b32 v40, s70, 31
+; MUBUF-NEXT:    v_writelane_b32 v40, s71, 32
+; MUBUF-NEXT:    v_writelane_b32 v40, s72, 33
+; MUBUF-NEXT:    v_writelane_b32 v40, s73, 34
+; MUBUF-NEXT:    v_writelane_b32 v40, s74, 35
+; MUBUF-NEXT:    v_writelane_b32 v40, s75, 36
+; MUBUF-NEXT:    v_writelane_b32 v40, s76, 37
+; MUBUF-NEXT:    v_writelane_b32 v40, s77, 38
+; MUBUF-NEXT:    v_writelane_b32 v40, s78, 39
+; MUBUF-NEXT:    v_writelane_b32 v40, s79, 40
+; MUBUF-NEXT:    v_writelane_b32 v40, s80, 41
+; MUBUF-NEXT:    v_writelane_b32 v40, s81, 42
+; MUBUF-NEXT:    v_writelane_b32 v40, s82, 43
+; MUBUF-NEXT:    v_writelane_b32 v40, s83, 44
+; MUBUF-NEXT:    v_writelane_b32 v40, s84, 45
+; MUBUF-NEXT:    v_writelane_b32 v40, s85, 46
+; MUBUF-NEXT:    v_writelane_b32 v40, s86, 47
+; MUBUF-NEXT:    v_writelane_b32 v40, s87, 48
+; MUBUF-NEXT:    v_writelane_b32 v40, s88, 49
+; MUBUF-NEXT:    v_writelane_b32 v40, s89, 50
+; MUBUF-NEXT:    v_writelane_b32 v40, s90, 51
+; MUBUF-NEXT:    v_writelane_b32 v40, s91, 52
+; MUBUF-NEXT:    v_writelane_b32 v40, s92, 53
+; MUBUF-NEXT:    v_writelane_b32 v40, s93, 54
+; MUBUF-NEXT:    v_writelane_b32 v40, s94, 55
+; MUBUF-NEXT:    v_writelane_b32 v40, s95, 56
+; MUBUF-NEXT:    v_writelane_b32 v40, s96, 57
+; MUBUF-NEXT:    v_writelane_b32 v40, s97, 58
+; MUBUF-NEXT:    v_writelane_b32 v40, s98, 59
+; MUBUF-NEXT:    v_writelane_b32 v40, s99, 60
+; MUBUF-NEXT:    v_writelane_b32 v40, s100, 61
+; MUBUF-NEXT:    v_writelane_b32 v40, s101, 62
+; MUBUF-NEXT:    v_writelane_b32 v41, s4, 0
+; MUBUF-NEXT:    s_addk_i32 s32, 0x300
+; MUBUF-NEXT:    v_writelane_b32 v40, s102, 63
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber all VGPRs except CSR v40
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    v_readlane_b32 s102, v40, 63
+; MUBUF-NEXT:    v_readlane_b32 s101, v40, 62
+; MUBUF-NEXT:    v_readlane_b32 s100, v40, 61
+; MUBUF-NEXT:    v_readlane_b32 s99, v40, 60
+; MUBUF-NEXT:    v_readlane_b32 s98, v40, 59
+; MUBUF-NEXT:    v_readlane_b32 s97, v40, 58
+; MUBUF-NEXT:    v_readlane_b32 s96, v40, 57
+; MUBUF-NEXT:    v_readlane_b32 s95, v40, 56
+; MUBUF-NEXT:    v_readlane_b32 s94, v40, 55
+; MUBUF-NEXT:    v_readlane_b32 s93, v40, 54
+; MUBUF-NEXT:    v_readlane_b32 s92, v40, 53
+; MUBUF-NEXT:    v_readlane_b32 s91, v40, 52
+; MUBUF-NEXT:    v_readlane_b32 s90, v40, 51
+; MUBUF-NEXT:    v_readlane_b32 s89, v40, 50
+; MUBUF-NEXT:    v_readlane_b32 s88, v40, 49
+; MUBUF-NEXT:    v_readlane_b32 s87, v40, 48
+; MUBUF-NEXT:    v_readlane_b32 s86, v40, 47
+; MUBUF-NEXT:    v_readlane_b32 s85, v40, 46
+; MUBUF-NEXT:    v_readlane_b32 s84, v40, 45
+; MUBUF-NEXT:    v_readlane_b32 s83, v40, 44
+; MUBUF-NEXT:    v_readlane_b32 s82, v40, 43
+; MUBUF-NEXT:    v_readlane_b32 s81, v40, 42
+; MUBUF-NEXT:    v_readlane_b32 s80, v40, 41
+; MUBUF-NEXT:    v_readlane_b32 s79, v40, 40
+; MUBUF-NEXT:    v_readlane_b32 s78, v40, 39
+; MUBUF-NEXT:    v_readlane_b32 s77, v40, 38
+; MUBUF-NEXT:    v_readlane_b32 s76, v40, 37
+; MUBUF-NEXT:    v_readlane_b32 s75, v40, 36
+; MUBUF-NEXT:    v_readlane_b32 s74, v40, 35
+; MUBUF-NEXT:    v_readlane_b32 s73, v40, 34
+; MUBUF-NEXT:    v_readlane_b32 s72, v40, 33
+; MUBUF-NEXT:    v_readlane_b32 s71, v40, 32
+; MUBUF-NEXT:    v_readlane_b32 s70, v40, 31
+; MUBUF-NEXT:    v_readlane_b32 s69, v40, 30
+; MUBUF-NEXT:    v_readlane_b32 s68, v40, 29
+; MUBUF-NEXT:    v_readlane_b32 s67, v40, 28
+; MUBUF-NEXT:    v_readlane_b32 s66, v40, 27
+; MUBUF-NEXT:    v_readlane_b32 s65, v40, 26
+; MUBUF-NEXT:    v_readlane_b32 s64, v40, 25
+; MUBUF-NEXT:    v_readlane_b32 s63, v40, 24
+; MUBUF-NEXT:    v_readlane_b32 s62, v40, 23
+; MUBUF-NEXT:    v_readlane_b32 s61, v40, 22
+; MUBUF-NEXT:    v_readlane_b32 s60, v40, 21
+; MUBUF-NEXT:    v_readlane_b32 s59, v40, 20
+; MUBUF-NEXT:    v_readlane_b32 s58, v40, 19
+; MUBUF-NEXT:    v_readlane_b32 s57, v40, 18
+; MUBUF-NEXT:    v_readlane_b32 s56, v40, 17
+; MUBUF-NEXT:    v_readlane_b32 s55, v40, 16
+; MUBUF-NEXT:    v_readlane_b32 s54, v40, 15
+; MUBUF-NEXT:    v_readlane_b32 s53, v40, 14
+; MUBUF-NEXT:    v_readlane_b32 s52, v40, 13
+; MUBUF-NEXT:    v_readlane_b32 s51, v40, 12
+; MUBUF-NEXT:    v_readlane_b32 s50, v40, 11
+; MUBUF-NEXT:    v_readlane_b32 s49, v40, 10
+; MUBUF-NEXT:    v_readlane_b32 s48, v40, 9
+; MUBUF-NEXT:    v_readlane_b32 s47, v40, 8
+; MUBUF-NEXT:    v_readlane_b32 s46, v40, 7
+; MUBUF-NEXT:    v_readlane_b32 s45, v40, 6
+; MUBUF-NEXT:    v_readlane_b32 s44, v40, 5
+; MUBUF-NEXT:    v_readlane_b32 s43, v40, 4
+; MUBUF-NEXT:    v_readlane_b32 s42, v40, 3
+; MUBUF-NEXT:    v_readlane_b32 s41, v40, 2
+; MUBUF-NEXT:    v_readlane_b32 s40, v40, 1
+; MUBUF-NEXT:    v_readlane_b32 s39, v40, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    v_readlane_b32 s4, v41, 0
+; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_need_to_spill_fp_to_reg:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    v_writelane_b32 v40, s39, 0
+; FLATSCR-NEXT:    v_writelane_b32 v40, s40, 1
+; FLATSCR-NEXT:    v_writelane_b32 v40, s41, 2
+; FLATSCR-NEXT:    v_writelane_b32 v40, s42, 3
+; FLATSCR-NEXT:    v_writelane_b32 v40, s43, 4
+; FLATSCR-NEXT:    v_writelane_b32 v40, s44, 5
+; FLATSCR-NEXT:    v_writelane_b32 v40, s45, 6
+; FLATSCR-NEXT:    v_writelane_b32 v40, s46, 7
+; FLATSCR-NEXT:    v_writelane_b32 v40, s47, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s48, 9
+; FLATSCR-NEXT:    v_writelane_b32 v40, s49, 10
+; FLATSCR-NEXT:    v_writelane_b32 v40, s50, 11
+; FLATSCR-NEXT:    v_writelane_b32 v40, s51, 12
+; FLATSCR-NEXT:    v_writelane_b32 v40, s52, 13
+; FLATSCR-NEXT:    v_writelane_b32 v40, s53, 14
+; FLATSCR-NEXT:    v_writelane_b32 v40, s54, 15
+; FLATSCR-NEXT:    v_writelane_b32 v40, s55, 16
+; FLATSCR-NEXT:    v_writelane_b32 v40, s56, 17
+; FLATSCR-NEXT:    v_writelane_b32 v40, s57, 18
+; FLATSCR-NEXT:    v_writelane_b32 v40, s58, 19
+; FLATSCR-NEXT:    v_writelane_b32 v40, s59, 20
+; FLATSCR-NEXT:    v_writelane_b32 v40, s60, 21
+; FLATSCR-NEXT:    v_writelane_b32 v40, s61, 22
+; FLATSCR-NEXT:    v_writelane_b32 v40, s62, 23
+; FLATSCR-NEXT:    v_writelane_b32 v40, s63, 24
+; FLATSCR-NEXT:    v_writelane_b32 v40, s64, 25
+; FLATSCR-NEXT:    v_writelane_b32 v40, s65, 26
+; FLATSCR-NEXT:    v_writelane_b32 v40, s66, 27
+; FLATSCR-NEXT:    v_writelane_b32 v40, s67, 28
+; FLATSCR-NEXT:    v_writelane_b32 v40, s68, 29
+; FLATSCR-NEXT:    v_writelane_b32 v40, s69, 30
+; FLATSCR-NEXT:    v_writelane_b32 v40, s70, 31
+; FLATSCR-NEXT:    v_writelane_b32 v40, s71, 32
+; FLATSCR-NEXT:    v_writelane_b32 v40, s72, 33
+; FLATSCR-NEXT:    v_writelane_b32 v40, s73, 34
+; FLATSCR-NEXT:    v_writelane_b32 v40, s74, 35
+; FLATSCR-NEXT:    v_writelane_b32 v40, s75, 36
+; FLATSCR-NEXT:    v_writelane_b32 v40, s76, 37
+; FLATSCR-NEXT:    v_writelane_b32 v40, s77, 38
+; FLATSCR-NEXT:    v_writelane_b32 v40, s78, 39
+; FLATSCR-NEXT:    v_writelane_b32 v40, s79, 40
+; FLATSCR-NEXT:    v_writelane_b32 v40, s80, 41
+; FLATSCR-NEXT:    v_writelane_b32 v40, s81, 42
+; FLATSCR-NEXT:    v_writelane_b32 v40, s82, 43
+; FLATSCR-NEXT:    v_writelane_b32 v40, s83, 44
+; FLATSCR-NEXT:    v_writelane_b32 v40, s84, 45
+; FLATSCR-NEXT:    v_writelane_b32 v40, s85, 46
+; FLATSCR-NEXT:    v_writelane_b32 v40, s86, 47
+; FLATSCR-NEXT:    v_writelane_b32 v40, s87, 48
+; FLATSCR-NEXT:    v_writelane_b32 v40, s88, 49
+; FLATSCR-NEXT:    v_writelane_b32 v40, s89, 50
+; FLATSCR-NEXT:    v_writelane_b32 v40, s90, 51
+; FLATSCR-NEXT:    v_writelane_b32 v40, s91, 52
+; FLATSCR-NEXT:    v_writelane_b32 v40, s92, 53
+; FLATSCR-NEXT:    v_writelane_b32 v40, s93, 54
+; FLATSCR-NEXT:    v_writelane_b32 v40, s94, 55
+; FLATSCR-NEXT:    v_writelane_b32 v40, s95, 56
+; FLATSCR-NEXT:    v_writelane_b32 v40, s96, 57
+; FLATSCR-NEXT:    v_writelane_b32 v40, s97, 58
+; FLATSCR-NEXT:    v_writelane_b32 v40, s98, 59
+; FLATSCR-NEXT:    v_writelane_b32 v40, s99, 60
+; FLATSCR-NEXT:    v_writelane_b32 v40, s100, 61
+; FLATSCR-NEXT:    v_writelane_b32 v40, s101, 62
+; FLATSCR-NEXT:    s_add_i32 s32, s32, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s102, 63
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber all VGPRs except CSR v40
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s102, v40, 63
+; FLATSCR-NEXT:    v_readlane_b32 s101, v40, 62
+; FLATSCR-NEXT:    v_readlane_b32 s100, v40, 61
+; FLATSCR-NEXT:    v_readlane_b32 s99, v40, 60
+; FLATSCR-NEXT:    v_readlane_b32 s98, v40, 59
+; FLATSCR-NEXT:    v_readlane_b32 s97, v40, 58
+; FLATSCR-NEXT:    v_readlane_b32 s96, v40, 57
+; FLATSCR-NEXT:    v_readlane_b32 s95, v40, 56
+; FLATSCR-NEXT:    v_readlane_b32 s94, v40, 55
+; FLATSCR-NEXT:    v_readlane_b32 s93, v40, 54
+; FLATSCR-NEXT:    v_readlane_b32 s92, v40, 53
+; FLATSCR-NEXT:    v_readlane_b32 s91, v40, 52
+; FLATSCR-NEXT:    v_readlane_b32 s90, v40, 51
+; FLATSCR-NEXT:    v_readlane_b32 s89, v40, 50
+; FLATSCR-NEXT:    v_readlane_b32 s88, v40, 49
+; FLATSCR-NEXT:    v_readlane_b32 s87, v40, 48
+; FLATSCR-NEXT:    v_readlane_b32 s86, v40, 47
+; FLATSCR-NEXT:    v_readlane_b32 s85, v40, 46
+; FLATSCR-NEXT:    v_readlane_b32 s84, v40, 45
+; FLATSCR-NEXT:    v_readlane_b32 s83, v40, 44
+; FLATSCR-NEXT:    v_readlane_b32 s82, v40, 43
+; FLATSCR-NEXT:    v_readlane_b32 s81, v40, 42
+; FLATSCR-NEXT:    v_readlane_b32 s80, v40, 41
+; FLATSCR-NEXT:    v_readlane_b32 s79, v40, 40
+; FLATSCR-NEXT:    v_readlane_b32 s78, v40, 39
+; FLATSCR-NEXT:    v_readlane_b32 s77, v40, 38
+; FLATSCR-NEXT:    v_readlane_b32 s76, v40, 37
+; FLATSCR-NEXT:    v_readlane_b32 s75, v40, 36
+; FLATSCR-NEXT:    v_readlane_b32 s74, v40, 35
+; FLATSCR-NEXT:    v_readlane_b32 s73, v40, 34
+; FLATSCR-NEXT:    v_readlane_b32 s72, v40, 33
+; FLATSCR-NEXT:    v_readlane_b32 s71, v40, 32
+; FLATSCR-NEXT:    v_readlane_b32 s70, v40, 31
+; FLATSCR-NEXT:    v_readlane_b32 s69, v40, 30
+; FLATSCR-NEXT:    v_readlane_b32 s68, v40, 29
+; FLATSCR-NEXT:    v_readlane_b32 s67, v40, 28
+; FLATSCR-NEXT:    v_readlane_b32 s66, v40, 27
+; FLATSCR-NEXT:    v_readlane_b32 s65, v40, 26
+; FLATSCR-NEXT:    v_readlane_b32 s64, v40, 25
+; FLATSCR-NEXT:    v_readlane_b32 s63, v40, 24
+; FLATSCR-NEXT:    v_readlane_b32 s62, v40, 23
+; FLATSCR-NEXT:    v_readlane_b32 s61, v40, 22
+; FLATSCR-NEXT:    v_readlane_b32 s60, v40, 21
+; FLATSCR-NEXT:    v_readlane_b32 s59, v40, 20
+; FLATSCR-NEXT:    v_readlane_b32 s58, v40, 19
+; FLATSCR-NEXT:    v_readlane_b32 s57, v40, 18
+; FLATSCR-NEXT:    v_readlane_b32 s56, v40, 17
+; FLATSCR-NEXT:    v_readlane_b32 s55, v40, 16
+; FLATSCR-NEXT:    v_readlane_b32 s54, v40, 15
+; FLATSCR-NEXT:    v_readlane_b32 s53, v40, 14
+; FLATSCR-NEXT:    v_readlane_b32 s52, v40, 13
+; FLATSCR-NEXT:    v_readlane_b32 s51, v40, 12
+; FLATSCR-NEXT:    v_readlane_b32 s50, v40, 11
+; FLATSCR-NEXT:    v_readlane_b32 s49, v40, 10
+; FLATSCR-NEXT:    v_readlane_b32 s48, v40, 9
+; FLATSCR-NEXT:    v_readlane_b32 s47, v40, 8
+; FLATSCR-NEXT:    v_readlane_b32 s46, v40, 7
+; FLATSCR-NEXT:    v_readlane_b32 s45, v40, 6
+; FLATSCR-NEXT:    v_readlane_b32 s44, v40, 5
+; FLATSCR-NEXT:    v_readlane_b32 s43, v40, 4
+; FLATSCR-NEXT:    v_readlane_b32 s42, v40, 3
+; FLATSCR-NEXT:    v_readlane_b32 s41, v40, 2
+; FLATSCR-NEXT:    v_readlane_b32 s40, v40, 1
+; FLATSCR-NEXT:    v_readlane_b32 s39, v40, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
     "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
     ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
@@ -652,20 +2357,327 @@ define void @callee_need_to_spill_fp_to_reg() #1 {
 
 ; If the size of the offset exceeds the MUBUF offset field we need another
 ; scratch VGPR to hold the offset.
-; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset
-; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; MUBUF-NEXT: s_mov_b32 s33, s32
-; MUBUF-NEXT: s_xor_saveexec_b64 s[6:7], -1
-; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100
-; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
-; MUBUF: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
-; GCN-NOT: v_mov_b32_e32 v0, 0x100c
-; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200
-; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
-; FLATSCR: v_mov_b32_e32 v0, 0
-; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000
-; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
 define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #3 {
+; MUBUF-LABEL: spill_fp_to_memory_scratch_reg_needed_mubuf_offset:
+; MUBUF:       ; %bb.0:
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT:    s_mov_b32 s4, s33
+; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    s_add_i32 s5, s33, 0x40100
+; MUBUF-NEXT:    buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    v_writelane_b32 v39, s39, 0
+; MUBUF-NEXT:    v_writelane_b32 v39, s40, 1
+; MUBUF-NEXT:    v_writelane_b32 v39, s41, 2
+; MUBUF-NEXT:    v_writelane_b32 v39, s42, 3
+; MUBUF-NEXT:    v_writelane_b32 v39, s43, 4
+; MUBUF-NEXT:    v_writelane_b32 v39, s44, 5
+; MUBUF-NEXT:    v_writelane_b32 v39, s45, 6
+; MUBUF-NEXT:    v_writelane_b32 v39, s46, 7
+; MUBUF-NEXT:    v_writelane_b32 v39, s47, 8
+; MUBUF-NEXT:    v_writelane_b32 v39, s48, 9
+; MUBUF-NEXT:    v_writelane_b32 v39, s49, 10
+; MUBUF-NEXT:    v_writelane_b32 v39, s50, 11
+; MUBUF-NEXT:    v_writelane_b32 v39, s51, 12
+; MUBUF-NEXT:    v_writelane_b32 v39, s52, 13
+; MUBUF-NEXT:    v_writelane_b32 v39, s53, 14
+; MUBUF-NEXT:    v_writelane_b32 v39, s54, 15
+; MUBUF-NEXT:    v_writelane_b32 v39, s55, 16
+; MUBUF-NEXT:    v_writelane_b32 v39, s56, 17
+; MUBUF-NEXT:    v_writelane_b32 v39, s57, 18
+; MUBUF-NEXT:    v_writelane_b32 v39, s58, 19
+; MUBUF-NEXT:    v_writelane_b32 v39, s59, 20
+; MUBUF-NEXT:    v_writelane_b32 v39, s60, 21
+; MUBUF-NEXT:    v_writelane_b32 v39, s61, 22
+; MUBUF-NEXT:    v_writelane_b32 v39, s62, 23
+; MUBUF-NEXT:    v_writelane_b32 v39, s63, 24
+; MUBUF-NEXT:    v_writelane_b32 v39, s64, 25
+; MUBUF-NEXT:    v_writelane_b32 v39, s65, 26
+; MUBUF-NEXT:    v_writelane_b32 v39, s66, 27
+; MUBUF-NEXT:    v_writelane_b32 v39, s67, 28
+; MUBUF-NEXT:    v_writelane_b32 v39, s68, 29
+; MUBUF-NEXT:    v_writelane_b32 v39, s69, 30
+; MUBUF-NEXT:    v_writelane_b32 v39, s70, 31
+; MUBUF-NEXT:    v_writelane_b32 v39, s71, 32
+; MUBUF-NEXT:    v_writelane_b32 v39, s72, 33
+; MUBUF-NEXT:    v_writelane_b32 v39, s73, 34
+; MUBUF-NEXT:    v_writelane_b32 v39, s74, 35
+; MUBUF-NEXT:    v_writelane_b32 v39, s75, 36
+; MUBUF-NEXT:    v_writelane_b32 v39, s76, 37
+; MUBUF-NEXT:    v_writelane_b32 v39, s77, 38
+; MUBUF-NEXT:    v_writelane_b32 v39, s78, 39
+; MUBUF-NEXT:    v_writelane_b32 v39, s79, 40
+; MUBUF-NEXT:    v_writelane_b32 v39, s80, 41
+; MUBUF-NEXT:    v_writelane_b32 v39, s81, 42
+; MUBUF-NEXT:    v_writelane_b32 v39, s82, 43
+; MUBUF-NEXT:    v_writelane_b32 v39, s83, 44
+; MUBUF-NEXT:    v_writelane_b32 v39, s84, 45
+; MUBUF-NEXT:    v_writelane_b32 v39, s85, 46
+; MUBUF-NEXT:    v_writelane_b32 v39, s86, 47
+; MUBUF-NEXT:    v_writelane_b32 v39, s87, 48
+; MUBUF-NEXT:    v_writelane_b32 v39, s88, 49
+; MUBUF-NEXT:    v_writelane_b32 v39, s89, 50
+; MUBUF-NEXT:    v_writelane_b32 v39, s90, 51
+; MUBUF-NEXT:    v_writelane_b32 v39, s91, 52
+; MUBUF-NEXT:    v_writelane_b32 v39, s92, 53
+; MUBUF-NEXT:    v_writelane_b32 v39, s93, 54
+; MUBUF-NEXT:    v_writelane_b32 v39, s94, 55
+; MUBUF-NEXT:    v_writelane_b32 v39, s95, 56
+; MUBUF-NEXT:    v_writelane_b32 v39, s96, 57
+; MUBUF-NEXT:    v_writelane_b32 v39, s97, 58
+; MUBUF-NEXT:    v_writelane_b32 v39, s98, 59
+; MUBUF-NEXT:    v_writelane_b32 v39, s99, 60
+; MUBUF-NEXT:    v_mov_b32_e32 v0, s4
+; MUBUF-NEXT:    s_add_i32 s5, s33, 0x40200
+; MUBUF-NEXT:    v_writelane_b32 v39, s100, 61
+; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; MUBUF-NEXT:    v_writelane_b32 v39, s101, 62
+; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
+; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x1000
+; MUBUF-NEXT:    v_writelane_b32 v39, s102, 63
+; MUBUF-NEXT:    buffer_store_dword v0, v1, s[0:3], s33 offen
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; clobber all VGPRs except CSR v40
+; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_add_i32 s5, s33, 0x40200
+; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_add_i32 s32, s32, 0x40400
+; MUBUF-NEXT:    v_readlane_b32 s102, v39, 63
+; MUBUF-NEXT:    v_readlane_b32 s101, v39, 62
+; MUBUF-NEXT:    v_readlane_b32 s100, v39, 61
+; MUBUF-NEXT:    v_readlane_b32 s99, v39, 60
+; MUBUF-NEXT:    v_readlane_b32 s98, v39, 59
+; MUBUF-NEXT:    v_readlane_b32 s97, v39, 58
+; MUBUF-NEXT:    v_readlane_b32 s96, v39, 57
+; MUBUF-NEXT:    v_readlane_b32 s95, v39, 56
+; MUBUF-NEXT:    v_readlane_b32 s94, v39, 55
+; MUBUF-NEXT:    v_readlane_b32 s93, v39, 54
+; MUBUF-NEXT:    v_readlane_b32 s92, v39, 53
+; MUBUF-NEXT:    v_readlane_b32 s91, v39, 52
+; MUBUF-NEXT:    v_readlane_b32 s90, v39, 51
+; MUBUF-NEXT:    v_readlane_b32 s89, v39, 50
+; MUBUF-NEXT:    v_readlane_b32 s88, v39, 49
+; MUBUF-NEXT:    v_readlane_b32 s87, v39, 48
+; MUBUF-NEXT:    v_readlane_b32 s86, v39, 47
+; MUBUF-NEXT:    v_readlane_b32 s85, v39, 46
+; MUBUF-NEXT:    v_readlane_b32 s84, v39, 45
+; MUBUF-NEXT:    v_readlane_b32 s83, v39, 44
+; MUBUF-NEXT:    v_readlane_b32 s82, v39, 43
+; MUBUF-NEXT:    v_readlane_b32 s81, v39, 42
+; MUBUF-NEXT:    v_readlane_b32 s80, v39, 41
+; MUBUF-NEXT:    v_readlane_b32 s79, v39, 40
+; MUBUF-NEXT:    v_readlane_b32 s78, v39, 39
+; MUBUF-NEXT:    v_readlane_b32 s77, v39, 38
+; MUBUF-NEXT:    v_readlane_b32 s76, v39, 37
+; MUBUF-NEXT:    v_readlane_b32 s75, v39, 36
+; MUBUF-NEXT:    v_readlane_b32 s74, v39, 35
+; MUBUF-NEXT:    v_readlane_b32 s73, v39, 34
+; MUBUF-NEXT:    v_readlane_b32 s72, v39, 33
+; MUBUF-NEXT:    v_readlane_b32 s71, v39, 32
+; MUBUF-NEXT:    v_readlane_b32 s70, v39, 31
+; MUBUF-NEXT:    v_readlane_b32 s69, v39, 30
+; MUBUF-NEXT:    v_readlane_b32 s68, v39, 29
+; MUBUF-NEXT:    v_readlane_b32 s67, v39, 28
+; MUBUF-NEXT:    v_readlane_b32 s66, v39, 27
+; MUBUF-NEXT:    v_readlane_b32 s65, v39, 26
+; MUBUF-NEXT:    v_readlane_b32 s64, v39, 25
+; MUBUF-NEXT:    v_readlane_b32 s63, v39, 24
+; MUBUF-NEXT:    v_readlane_b32 s62, v39, 23
+; MUBUF-NEXT:    v_readlane_b32 s61, v39, 22
+; MUBUF-NEXT:    v_readlane_b32 s60, v39, 21
+; MUBUF-NEXT:    v_readlane_b32 s59, v39, 20
+; MUBUF-NEXT:    v_readlane_b32 s58, v39, 19
+; MUBUF-NEXT:    v_readlane_b32 s57, v39, 18
+; MUBUF-NEXT:    v_readlane_b32 s56, v39, 17
+; MUBUF-NEXT:    v_readlane_b32 s55, v39, 16
+; MUBUF-NEXT:    v_readlane_b32 s54, v39, 15
+; MUBUF-NEXT:    v_readlane_b32 s53, v39, 14
+; MUBUF-NEXT:    v_readlane_b32 s52, v39, 13
+; MUBUF-NEXT:    v_readlane_b32 s51, v39, 12
+; MUBUF-NEXT:    v_readlane_b32 s50, v39, 11
+; MUBUF-NEXT:    v_readlane_b32 s49, v39, 10
+; MUBUF-NEXT:    v_readlane_b32 s48, v39, 9
+; MUBUF-NEXT:    v_readlane_b32 s47, v39, 8
+; MUBUF-NEXT:    v_readlane_b32 s46, v39, 7
+; MUBUF-NEXT:    v_readlane_b32 s45, v39, 6
+; MUBUF-NEXT:    v_readlane_b32 s44, v39, 5
+; MUBUF-NEXT:    v_readlane_b32 s43, v39, 4
+; MUBUF-NEXT:    v_readlane_b32 s42, v39, 3
+; MUBUF-NEXT:    v_readlane_b32 s41, v39, 2
+; MUBUF-NEXT:    v_readlane_b32 s40, v39, 1
+; MUBUF-NEXT:    v_readlane_b32 s39, v39, 0
+; MUBUF-NEXT:    s_mov_b32 s32, s33
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT:    s_add_i32 s5, s33, 0x40100
+; MUBUF-NEXT:    buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload
+; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT:    s_mov_b32 s33, s4
+; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: spill_fp_to_memory_scratch_reg_needed_mubuf_offset:
+; FLATSCR:       ; %bb.0:
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT:    s_mov_b32 s0, s33
+; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    s_add_i32 s1, s33, 0x1004
+; FLATSCR-NEXT:    scratch_store_dword off, v39, s1 ; 4-byte Folded Spill
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    v_writelane_b32 v39, s39, 0
+; FLATSCR-NEXT:    v_writelane_b32 v39, s40, 1
+; FLATSCR-NEXT:    v_writelane_b32 v39, s41, 2
+; FLATSCR-NEXT:    v_writelane_b32 v39, s42, 3
+; FLATSCR-NEXT:    v_writelane_b32 v39, s43, 4
+; FLATSCR-NEXT:    v_writelane_b32 v39, s44, 5
+; FLATSCR-NEXT:    v_writelane_b32 v39, s45, 6
+; FLATSCR-NEXT:    v_writelane_b32 v39, s46, 7
+; FLATSCR-NEXT:    v_writelane_b32 v39, s47, 8
+; FLATSCR-NEXT:    v_writelane_b32 v39, s48, 9
+; FLATSCR-NEXT:    v_writelane_b32 v39, s49, 10
+; FLATSCR-NEXT:    v_writelane_b32 v39, s50, 11
+; FLATSCR-NEXT:    v_writelane_b32 v39, s51, 12
+; FLATSCR-NEXT:    v_writelane_b32 v39, s52, 13
+; FLATSCR-NEXT:    v_writelane_b32 v39, s53, 14
+; FLATSCR-NEXT:    v_writelane_b32 v39, s54, 15
+; FLATSCR-NEXT:    v_writelane_b32 v39, s55, 16
+; FLATSCR-NEXT:    v_writelane_b32 v39, s56, 17
+; FLATSCR-NEXT:    v_writelane_b32 v39, s57, 18
+; FLATSCR-NEXT:    v_writelane_b32 v39, s58, 19
+; FLATSCR-NEXT:    v_writelane_b32 v39, s59, 20
+; FLATSCR-NEXT:    v_writelane_b32 v39, s60, 21
+; FLATSCR-NEXT:    v_writelane_b32 v39, s61, 22
+; FLATSCR-NEXT:    v_writelane_b32 v39, s62, 23
+; FLATSCR-NEXT:    v_writelane_b32 v39, s63, 24
+; FLATSCR-NEXT:    v_writelane_b32 v39, s64, 25
+; FLATSCR-NEXT:    v_writelane_b32 v39, s65, 26
+; FLATSCR-NEXT:    v_writelane_b32 v39, s66, 27
+; FLATSCR-NEXT:    v_writelane_b32 v39, s67, 28
+; FLATSCR-NEXT:    v_writelane_b32 v39, s68, 29
+; FLATSCR-NEXT:    v_writelane_b32 v39, s69, 30
+; FLATSCR-NEXT:    v_writelane_b32 v39, s70, 31
+; FLATSCR-NEXT:    v_writelane_b32 v39, s71, 32
+; FLATSCR-NEXT:    v_writelane_b32 v39, s72, 33
+; FLATSCR-NEXT:    v_writelane_b32 v39, s73, 34
+; FLATSCR-NEXT:    v_writelane_b32 v39, s74, 35
+; FLATSCR-NEXT:    v_writelane_b32 v39, s75, 36
+; FLATSCR-NEXT:    v_writelane_b32 v39, s76, 37
+; FLATSCR-NEXT:    v_writelane_b32 v39, s77, 38
+; FLATSCR-NEXT:    v_writelane_b32 v39, s78, 39
+; FLATSCR-NEXT:    v_writelane_b32 v39, s79, 40
+; FLATSCR-NEXT:    v_writelane_b32 v39, s80, 41
+; FLATSCR-NEXT:    v_writelane_b32 v39, s81, 42
+; FLATSCR-NEXT:    v_writelane_b32 v39, s82, 43
+; FLATSCR-NEXT:    v_writelane_b32 v39, s83, 44
+; FLATSCR-NEXT:    v_writelane_b32 v39, s84, 45
+; FLATSCR-NEXT:    v_writelane_b32 v39, s85, 46
+; FLATSCR-NEXT:    v_writelane_b32 v39, s86, 47
+; FLATSCR-NEXT:    v_writelane_b32 v39, s87, 48
+; FLATSCR-NEXT:    v_writelane_b32 v39, s88, 49
+; FLATSCR-NEXT:    v_writelane_b32 v39, s89, 50
+; FLATSCR-NEXT:    v_writelane_b32 v39, s90, 51
+; FLATSCR-NEXT:    v_writelane_b32 v39, s91, 52
+; FLATSCR-NEXT:    v_writelane_b32 v39, s92, 53
+; FLATSCR-NEXT:    v_writelane_b32 v39, s93, 54
+; FLATSCR-NEXT:    v_writelane_b32 v39, s94, 55
+; FLATSCR-NEXT:    v_writelane_b32 v39, s95, 56
+; FLATSCR-NEXT:    v_writelane_b32 v39, s96, 57
+; FLATSCR-NEXT:    v_writelane_b32 v39, s97, 58
+; FLATSCR-NEXT:    v_writelane_b32 v39, s98, 59
+; FLATSCR-NEXT:    v_writelane_b32 v39, s99, 60
+; FLATSCR-NEXT:    s_addk_i32 s32, 0x100c
+; FLATSCR-NEXT:    v_writelane_b32 v39, s100, 61
+; FLATSCR-NEXT:    v_writelane_b32 v39, s101, 62
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    s_add_i32 s1, s33, 0x1000
+; FLATSCR-NEXT:    v_writelane_b32 v39, s102, 63
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s1
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber nonpreserved SGPRs and 64 CSRs
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; clobber all VGPRs except CSR v40
+; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    v_readlane_b32 s102, v39, 63
+; FLATSCR-NEXT:    v_readlane_b32 s101, v39, 62
+; FLATSCR-NEXT:    v_readlane_b32 s100, v39, 61
+; FLATSCR-NEXT:    v_readlane_b32 s99, v39, 60
+; FLATSCR-NEXT:    v_readlane_b32 s98, v39, 59
+; FLATSCR-NEXT:    v_readlane_b32 s97, v39, 58
+; FLATSCR-NEXT:    v_readlane_b32 s96, v39, 57
+; FLATSCR-NEXT:    v_readlane_b32 s95, v39, 56
+; FLATSCR-NEXT:    v_readlane_b32 s94, v39, 55
+; FLATSCR-NEXT:    v_readlane_b32 s93, v39, 54
+; FLATSCR-NEXT:    v_readlane_b32 s92, v39, 53
+; FLATSCR-NEXT:    v_readlane_b32 s91, v39, 52
+; FLATSCR-NEXT:    v_readlane_b32 s90, v39, 51
+; FLATSCR-NEXT:    v_readlane_b32 s89, v39, 50
+; FLATSCR-NEXT:    v_readlane_b32 s88, v39, 49
+; FLATSCR-NEXT:    v_readlane_b32 s87, v39, 48
+; FLATSCR-NEXT:    v_readlane_b32 s86, v39, 47
+; FLATSCR-NEXT:    v_readlane_b32 s85, v39, 46
+; FLATSCR-NEXT:    v_readlane_b32 s84, v39, 45
+; FLATSCR-NEXT:    v_readlane_b32 s83, v39, 44
+; FLATSCR-NEXT:    v_readlane_b32 s82, v39, 43
+; FLATSCR-NEXT:    v_readlane_b32 s81, v39, 42
+; FLATSCR-NEXT:    v_readlane_b32 s80, v39, 41
+; FLATSCR-NEXT:    v_readlane_b32 s79, v39, 40
+; FLATSCR-NEXT:    v_readlane_b32 s78, v39, 39
+; FLATSCR-NEXT:    v_readlane_b32 s77, v39, 38
+; FLATSCR-NEXT:    v_readlane_b32 s76, v39, 37
+; FLATSCR-NEXT:    v_readlane_b32 s75, v39, 36
+; FLATSCR-NEXT:    v_readlane_b32 s74, v39, 35
+; FLATSCR-NEXT:    v_readlane_b32 s73, v39, 34
+; FLATSCR-NEXT:    v_readlane_b32 s72, v39, 33
+; FLATSCR-NEXT:    v_readlane_b32 s71, v39, 32
+; FLATSCR-NEXT:    v_readlane_b32 s70, v39, 31
+; FLATSCR-NEXT:    v_readlane_b32 s69, v39, 30
+; FLATSCR-NEXT:    v_readlane_b32 s68, v39, 29
+; FLATSCR-NEXT:    v_readlane_b32 s67, v39, 28
+; FLATSCR-NEXT:    v_readlane_b32 s66, v39, 27
+; FLATSCR-NEXT:    v_readlane_b32 s65, v39, 26
+; FLATSCR-NEXT:    v_readlane_b32 s64, v39, 25
+; FLATSCR-NEXT:    v_readlane_b32 s63, v39, 24
+; FLATSCR-NEXT:    v_readlane_b32 s62, v39, 23
+; FLATSCR-NEXT:    v_readlane_b32 s61, v39, 22
+; FLATSCR-NEXT:    v_readlane_b32 s60, v39, 21
+; FLATSCR-NEXT:    v_readlane_b32 s59, v39, 20
+; FLATSCR-NEXT:    v_readlane_b32 s58, v39, 19
+; FLATSCR-NEXT:    v_readlane_b32 s57, v39, 18
+; FLATSCR-NEXT:    v_readlane_b32 s56, v39, 17
+; FLATSCR-NEXT:    v_readlane_b32 s55, v39, 16
+; FLATSCR-NEXT:    v_readlane_b32 s54, v39, 15
+; FLATSCR-NEXT:    v_readlane_b32 s53, v39, 14
+; FLATSCR-NEXT:    v_readlane_b32 s52, v39, 13
+; FLATSCR-NEXT:    v_readlane_b32 s51, v39, 12
+; FLATSCR-NEXT:    v_readlane_b32 s50, v39, 11
+; FLATSCR-NEXT:    v_readlane_b32 s49, v39, 10
+; FLATSCR-NEXT:    v_readlane_b32 s48, v39, 9
+; FLATSCR-NEXT:    v_readlane_b32 s47, v39, 8
+; FLATSCR-NEXT:    v_readlane_b32 s46, v39, 7
+; FLATSCR-NEXT:    v_readlane_b32 s45, v39, 6
+; FLATSCR-NEXT:    v_readlane_b32 s44, v39, 5
+; FLATSCR-NEXT:    v_readlane_b32 s43, v39, 4
+; FLATSCR-NEXT:    v_readlane_b32 s42, v39, 3
+; FLATSCR-NEXT:    v_readlane_b32 s41, v39, 2
+; FLATSCR-NEXT:    v_readlane_b32 s40, v39, 1
+; FLATSCR-NEXT:    v_readlane_b32 s39, v39, 0
+; FLATSCR-NEXT:    s_mov_b32 s32, s33
+; FLATSCR-NEXT:    s_xor_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT:    s_add_i32 s1, s33, 0x1004
+; FLATSCR-NEXT:    scratch_load_dword v39, off, s1 ; 4-byte Folded Reload
+; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT:    s_mov_b32 s33, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
 
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 593f40fd1b25e1..31e520ce74d98b 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
@@ -7,45 +8,76 @@
 
 declare void @external_void_func_i32(i32) #0
 
-; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
-; GCN: s_waitcnt
-
 ; Spill CSR VGPR used for SGPR spilling
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-DAG: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2
-; GCN-DAG: v_writelane_b32 v40, s30, 0
-; GCN-DAG: v_writelane_b32 v40, s31, 1
-
-; GCN: s_swappc_b64
-
-; GCN: v_readlane_b32 s31, v40, 1
-; GCN: v_readlane_b32 s30, v40, 0
-; GCN: s_mov_b32 s32, s33
-
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
 define void @test_func_call_external_void_func_i32_imm() #0 {
+; GCN-LABEL: test_func_call_external_void_func_i32_imm:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s16, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[18:19]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s16, 2
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, external_void_func_i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, external_void_func_i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, 42
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    v_readlane_b32 s4, v40, 2
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b32 s33, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void @external_void_func_i32(i32 42)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
-; GCN: s_waitcnt
-; GCN: s_mov_b32 s33, s32
-; GCN-DAG: s_addk_i32 s32, 0x1400{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:
-; GCN: s_swappc_b64
-; GCN: s_setpc_b64
 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
+; GCN-LABEL: test_func_call_external_void_func_i32_imm_stack_use:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s16, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[18:19]
+; GCN-NEXT:    s_addk_i32 s32, 0x1400
+; GCN-NEXT:    v_writelane_b32 v40, s16, 2
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, external_void_func_i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, external_void_func_i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, 42
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    v_readlane_b32 s4, v40, 2
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b32 s33, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep15 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 16
   store volatile i32 0, ptr addrspace(5) %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 0676bc79a46f52..d09fc947bac187 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -1,29 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 target datalayout = "A5"
 
 ; FIXME: Why is this commuted only sometimes?
-; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64
 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
+; GFX9-LABEL: i32_fastcc_i32_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add0 = add i32 %arg0, %arg1
   ret i32 %add0
 }
 
-; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
-; GCN: s_waitcnt vmcnt(0)
-; GCN: s_setpc_b64
-; GCN: ; ScratchSize: 68
 define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
+; GFX9-LABEL: i32_fastcc_i32_i32_stack_object:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 9
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 68
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
   store volatile i32 9, ptr addrspace(5) %gep
@@ -31,19 +32,34 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
   ret i32 %add0
 }
 
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
 define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_i32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret i32 %ret
 }
 
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
-; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
-; GCN: s_setpc_b64
-; GCN: ; ScratchSize: 68
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 9
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+; GCN: ; ScratchSize: 68
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -52,12 +68,19 @@ entry:
   ret i32 %ret
 }
 
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
-; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
-; GCN: s_setpc_b64
-; GCN: ; ScratchSize: 136
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 9
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+; GCN: ; ScratchSize: 136
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -66,45 +89,108 @@ entry:
   ret i32 %ret
 }
 
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
 define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret void
 }
 
 ; It doesn't make sense to do a tail call from a kernel
-; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
-;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
 define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
+; CIVI-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_add_i32 s6, s6, s9
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; CIVI-NEXT:    s_add_u32 s0, s0, s9
+; CIVI-NEXT:    s_addc_u32 s1, s1, 0
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    s_getpc_b64 s[6:7]
+; CIVI-NEXT:    s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CIVI-NEXT:    s_mov_b32 s32, 0
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    v_mov_b32_e32 v0, s4
+; CIVI-NEXT:    v_mov_b32_e32 v1, s5
+; CIVI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; CIVI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9-NEXT:    s_add_u32 s0, s0, s9
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    s_endpgm
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret void
 }
 
-; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
-; GCN: s_waitcnt
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
-; GCN-NEXT: s_waitcnt vmcnt(0)
-
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-
-; GCN-NEXT: s_setpc_b64 s[30:31]
 define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 {
+; GFX9-LABEL: i32_fastcc_i32_byval_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg1.load = load i32, ptr addrspace(5) %arg1, align 4
   %add0 = add i32 %arg0, %arg1.load
   ret i32 %add0
 }
 
 ; Tail call disallowed with byval in parent.
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
-; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
-; GCN: s_swappc_b64
-; GCN-NOT: v_readlane_b32 s32
-; GCN: s_setpc_b64
 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s33
+; GCN-NEXT:    v_writelane_b32 v40, s4, 2
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    v_readlane_b32 s4, v40, 2
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b32 s33, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval)
   ret i32 %ret
@@ -113,34 +199,32 @@ entry:
 ; Tail call disallowed with byval in parent, not callee. The stack
 ; usage of incoming arguments must be <= the outgoing stack
 ; arguments.
-
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
-; GCN-NOT: v0
-; GCN-NOT: s32
-; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16
-; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
-; GCN-NEXT: s_setpc_b64
 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5)))
   ret i32 %ret
 }
 
-; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-
-; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
-; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
-; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
-
-
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9: v_add3_u32 v0, v0, v3, v2
-
-; GCN-NEXT: s_setpc_b64
 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
+; GFX9-LABEL: i32_fastcc_i32_i32_a32i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add3_u32 v0, v0, v3, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %val_firststack = extractvalue [32 x i32] %large, 30
   %val_laststack = extractvalue [32 x i32] %large, 31
   %add0 = add i32 %arg0, %arg1
@@ -150,31 +234,49 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
 }
 
 ; FIXME: Why load and store same location for stack args?
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
-
-; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD_2:v[0-9]+]], off, s[0:3], s32 offset:8
-
-; GCN-NOT: s32
-
-; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword [[LOAD_2]], off, s[0:3], s32 offset:8
-
-; GCN-NOT: s32
-; GCN: s_setpc_b64
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    s_waitcnt vmcnt(2)
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
   ret i32 %ret
 }
 
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
-; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32
-; GCN: s_setpc_b64
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v34, 9
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -186,54 +288,114 @@ entry:
 ; If the callee requires more stack argument space than the caller,
 ; don't do a tail call.
 ; TODO: Do we really need this restriction?
-
-; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
-; GCN: s_swappc_b64
-; GCN: s_setpc_b64
 define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
+; GCN-LABEL: no_sibling_call_callee_more_stack_space:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s4, 2
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-NEXT:    v_mov_b32_e32 v14, 0
+; GCN-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NEXT:    v_mov_b32_e32 v17, 0
+; GCN-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-NEXT:    v_mov_b32_e32 v20, 0
+; GCN-NEXT:    v_mov_b32_e32 v21, 0
+; GCN-NEXT:    v_mov_b32_e32 v22, 0
+; GCN-NEXT:    v_mov_b32_e32 v23, 0
+; GCN-NEXT:    v_mov_b32_e32 v24, 0
+; GCN-NEXT:    v_mov_b32_e32 v25, 0
+; GCN-NEXT:    v_mov_b32_e32 v26, 0
+; GCN-NEXT:    v_mov_b32_e32 v27, 0
+; GCN-NEXT:    v_mov_b32_e32 v28, 0
+; GCN-NEXT:    v_mov_b32_e32 v29, 0
+; GCN-NEXT:    v_mov_b32_e32 v30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    v_readlane_b32 s4, v40, 2
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b32 s33, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
   ret i32 %ret
 }
 
 ; Have another non-tail call in the function
-; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
-; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec
-; GCN-DAG: s_addk_i32 s32, 0x400
-; GCN: v_writelane_b32 [[CSRV]], [[FP_SCRATCH_COPY]], 2
-
-; GCN-DAG: s_getpc_b64 s[4:5]
-; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
-; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
-
-; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0
-; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1
-
-
-; GCN: s_swappc_b64
-
-; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
-; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1
-; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0
-; GCN-NEXT: s_mov_b32 s32, s33
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN-NEXT: s_setpc_b64 s[4:5]
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
+; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_other_call:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v42, s4, 2
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    v_writelane_b32 v42, s30, 0
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v42, s31, 1
+; GCN-NEXT:    v_mov_b32_e32 v40, v1
+; GCN-NEXT:    v_mov_b32_e32 v41, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, v41
+; GCN-NEXT:    v_mov_b32_e32 v1, v40
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
+; GCN-NEXT:    v_readlane_b32 s31, v42, 1
+; GCN-NEXT:    v_readlane_b32 s30, v42, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    v_readlane_b32 s6, v42, 2
+; GCN-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[8:9]
+; GCN-NEXT:    s_mov_b32 s33, s6
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
@@ -242,16 +404,25 @@ entry:
 
 ; Have stack object in caller and stack passed arguments. SP should be
 ; in same place at function exit.
-
-; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
-; GCN-NOT: s33
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
-
-; GCN-NOT: s33
-
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
-; GCN: s_setpc_b64 s[4:5]
 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
+; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v34, 9
+; GCN-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -260,13 +431,52 @@ entry:
   ret i32 %ret
 }
 
-; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
-; GCN-NOT: s33
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
-
-; GCN-NOT: s33
-; GCN: s_setpc_b64 s[4:5]
 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
+; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 9
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:48
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-NEXT:    v_mov_b32_e32 v14, 0
+; GCN-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NEXT:    v_mov_b32_e32 v17, 0
+; GCN-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-NEXT:    v_mov_b32_e32 v20, 0
+; GCN-NEXT:    v_mov_b32_e32 v21, 0
+; GCN-NEXT:    v_mov_b32_e32 v22, 0
+; GCN-NEXT:    v_mov_b32_e32 v23, 0
+; GCN-NEXT:    v_mov_b32_e32 v24, 0
+; GCN-NEXT:    v_mov_b32_e32 v25, 0
+; GCN-NEXT:    v_mov_b32_e32 v26, 0
+; GCN-NEXT:    v_mov_b32_e32 v27, 0
+; GCN-NEXT:    v_mov_b32_e32 v28, 0
+; GCN-NEXT:    v_mov_b32_e32 v29, 0
+; GCN-NEXT:    v_mov_b32_e32 v30, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -278,11 +488,18 @@ entry:
 @func_ptr_gv = external unnamed_addr addrspace(4) constant ptr, align 4
 
 ; Do support tail calls with a uniform, but unknown, callee.
-; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32:
-; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]]
-; GCN: s_setpc_b64 [[FUNC_PTR]]
 define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+; GCN-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv
   %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
@@ -291,14 +508,97 @@ entry:
 
 ; We can't support a tail call to a divergent target. Use a waterfall
 ; loop around a regular call
-; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32:
-; GCN: v_readfirstlane_b32
-; GCN: v_readfirstlane_b32
-; GCN: s_and_saveexec_b64
-; GCN: s_swappc_b64
-; GCN: s_cbranch_execnz
-; GCN: s_setpc_b64
 define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
+; GFX9-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s16, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-NEXT:    v_writelane_b32 v40, s16, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s40, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s41, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s42, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s43, 11
+; GFX9-NEXT:    v_writelane_b32 v40, s44, 12
+; GFX9-NEXT:    v_writelane_b32 v40, s45, 13
+; GFX9-NEXT:    v_writelane_b32 v40, s46, 14
+; GFX9-NEXT:    v_writelane_b32 v40, s47, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s48, 16
+; GFX9-NEXT:    s_mov_b32 s42, s15
+; GFX9-NEXT:    s_mov_b32 s43, s14
+; GFX9-NEXT:    s_mov_b32 s44, s13
+; GFX9-NEXT:    s_mov_b32 s45, s12
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GFX9-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GFX9-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT:    s_mov_b64 s[46:47], exec
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s49, 17
+; GFX9-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s17, v1
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
+; GFX9-NEXT:    s_and_saveexec_b64 s[48:49], vcc
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT:    s_mov_b32 s12, s45
+; GFX9-NEXT:    s_mov_b32 s13, s44
+; GFX9-NEXT:    s_mov_b32 s14, s43
+; GFX9-NEXT:    s_mov_b32 s15, s42
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:    ; implicit-def: $vgpr31
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:    ; implicit-def: $vgpr3
+; GFX9-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GFX9-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX9-NEXT:  ; %bb.2:
+; GFX9-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_readlane_b32 s49, v40, 17
+; GFX9-NEXT:    v_readlane_b32 s48, v40, 16
+; GFX9-NEXT:    v_readlane_b32 s47, v40, 15
+; GFX9-NEXT:    v_readlane_b32 s46, v40, 14
+; GFX9-NEXT:    v_readlane_b32 s45, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s44, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s43, v40, 11
+; GFX9-NEXT:    v_readlane_b32 s42, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s41, v40, 9
+; GFX9-NEXT:    v_readlane_b32 s40, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 18
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %b, %c
   %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
@@ -307,30 +607,30 @@ entry:
 
 declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64]))
 
-; GCN-LABEL: {{^}}sibling_call_fastcc_multi_byval:
-; GCN-DAG: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
-
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
-
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:160
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:164
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:168
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:172
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:16{{$}}
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:28{{$}}
-
-; GCN: s_setpc_b64 [[TARGET_ADDR]]
 define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
+; GCN-LABEL: sibling_call_fastcc_multi_byval:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, 9
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:172
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:164
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:24
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %alloca0 = alloca [3 x i32], align 16, addrspace(5)
   %alloca1 = alloca [2 x i64], align 8, addrspace(5)
@@ -343,26 +643,55 @@ entry:
 declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([3 x i32]) align 16, [32 x i32], i32)
 
 ; Callee has a byval and non-byval stack passed argument
-; GCN-LABEL: {{^}}sibling_call_byval_and_stack_passed:
-; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:16
-
-; GCN: v_mov_b32_e32 v0, 0
-; GCN: v_mov_b32_e32 v30, 0
-
-; GCN: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_setpc_b64 [[TARGET_ADDR]]
 define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
+; GCN-LABEL: sibling_call_byval_and_stack_passed:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, 9
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-NEXT:    v_mov_b32_e32 v14, 0
+; GCN-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NEXT:    v_mov_b32_e32 v17, 0
+; GCN-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-NEXT:    v_mov_b32_e32 v19, 0
+; GCN-NEXT:    v_mov_b32_e32 v20, 0
+; GCN-NEXT:    v_mov_b32_e32 v21, 0
+; GCN-NEXT:    v_mov_b32_e32 v22, 0
+; GCN-NEXT:    v_mov_b32_e32 v23, 0
+; GCN-NEXT:    v_mov_b32_e32 v24, 0
+; GCN-NEXT:    v_mov_b32_e32 v25, 0
+; GCN-NEXT:    v_mov_b32_e32 v26, 0
+; GCN-NEXT:    v_mov_b32_e32 v27, 0
+; GCN-NEXT:    v_mov_b32_e32 v28, 0
+; GCN-NEXT:    v_mov_b32_e32 v29, 0
+; GCN-NEXT:    v_mov_b32_e32 v30, 0
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %alloca = alloca [3 x i32], align 16, addrspace(5)
   store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca
@@ -372,13 +701,14 @@ entry:
 
 declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0)
 
-; GCN-LABEL: {{^}}sibling_call_i64_fastcc_i64:
-; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_setpc_b64
 define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
+; GCN-LABEL: sibling_call_i64_fastcc_i64:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
   ret i64 %ret
@@ -386,13 +716,14 @@ entry:
 
 declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0)
 
-; GCN-LABEL: {{^}}sibling_call_p1i8_fastcc_p1i8:
-; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_setpc_b64
 define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 {
+; GCN-LABEL: sibling_call_p1i8_fastcc_p1i8:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a)
   ret ptr addrspace(1) %ret
@@ -400,13 +731,14 @@ entry:
 
 declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0)
 
-; GCN-LABEL: {{^}}sibling_call_i16_fastcc_i16:
-; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_setpc_b64
 define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
+; GCN-LABEL: sibling_call_i16_fastcc_i16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
   ret i16 %ret
@@ -414,13 +746,14 @@ entry:
 
 declare hidden fastcc half @f16_fastcc_f16(half %arg0)
 
-; GCN-LABEL: {{^}}sibling_call_f16_fastcc_f16:
-; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_setpc_b64
 define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
+; GCN-LABEL: sibling_call_f16_fastcc_f16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc half @f16_fastcc_f16(half %a)
   ret half %ret
@@ -428,13 +761,14 @@ entry:
 
 declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0)
 
-; GCN-LABEL: {{^}}sibling_call_v3i16_fastcc_v3i16:
-; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_setpc_b64
 define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 {
+; GCN-LABEL: sibling_call_v3i16_fastcc_v3i16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
   ret <3 x i16> %ret
@@ -442,13 +776,14 @@ entry:
 
 declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0)
 
-; GCN-LABEL: {{^}}sibling_call_v4i16_fastcc_v4i16:
-; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_setpc_b64
 define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 {
+; GCN-LABEL: sibling_call_v4i16_fastcc_v4i16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
   ret <4 x i16> %ret
@@ -456,13 +791,14 @@ entry:
 
 declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0)
 
-; GCN-LABEL: {{^}}sibling_call_v2i64_fastcc_v2i64:
-; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_setpc_b64
 define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 {
+; GCN-LABEL: sibling_call_v2i64_fastcc_v2i64:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_getpc_b64 s[16:17]
+; GCN-NEXT:    s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12
+; GCN-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)
   ret <2 x i64> %ret

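For anyone wanting to reproduce the check lines above: they are produced by the
UpdateTestChecks script named in the test header, using the recorded UTC_ARGS.
A rough sketch of the invocation, assuming an in-tree build with llc on PATH
(the script also accepts --llc-binary to point at a specific llc), would be:

  llvm/utils/update_llc_test_checks.py --version 5 \
      llvm/test/CodeGen/AMDGPU/sibling-call.ll

The script reruns each RUN line and rewrites the generated CHECK blocks in
place, so no functional change is intended.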
>From 4b16622a2c124927b8c454ddb43f163a937159b8 Mon Sep 17 00:00:00 2001
From: easyonaadit <aaditya.alokdeshpande at amd.com>
Date: Mon, 27 Jan 2025 11:40:29 +0530
Subject: [PATCH 2/2] Split run-lines

---
 llvm/test/CodeGen/AMDGPU/sibling-call.ll | 1142 +++++++++++++++++++++-
 1 file changed, 1136 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index d09fc947bac187..e20248426324fd 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -1,11 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 target datalayout = "A5"
 
 ; FIXME: Why is this commuted only sometimes?
 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
+; CIVI-LABEL: i32_fastcc_i32_i32:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-LABEL: i32_fastcc_i32_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: i32_fastcc_i32_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -16,6 +28,24 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
 }
 
 define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
+; CIVI-LABEL: i32_fastcc_i32_i32_stack_object:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    v_mov_b32_e32 v2, 9
+; CIVI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-LABEL: i32_fastcc_i32_i32_stack_object:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v2, 9
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: i32_fastcc_i32_i32_stack_object:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -24,7 +54,6 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
-; GCN: ; ScratchSize: 68
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
   store volatile i32 9, ptr addrspace(5) %gep
@@ -33,6 +62,16 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
 }
 
 define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -42,12 +81,34 @@ define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret i32 %ret
 }
 
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    v_mov_b32_e32 v2, 9
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; CIVI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -59,7 +120,18 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b,
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
-; GCN: ; ScratchSize: 68
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 9
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -69,6 +141,18 @@ entry:
 }
 
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    v_mov_b32_e32 v2, 9
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; CIVI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -80,7 +164,18 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
-; GCN: ; ScratchSize: 136
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 9
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -90,6 +185,16 @@ entry:
 }
 
 define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,6 +204,16 @@ define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret void
@@ -125,6 +240,25 @@ define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a,
 ; CIVI-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CIVI-NEXT:    s_endpgm
 ;
+; GCN-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_add_i32 s6, s6, s9
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GCN-NEXT:    s_mov_b32 s32, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_endpgm
+;
 ; GFX9-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
@@ -148,6 +282,22 @@ entry:
 }
 
 define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) byval(i32) align 4 %arg1) #1 {
+; CIVI-LABEL: i32_fastcc_i32_byval_i32:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    buffer_load_dword v1, off, s[0:3], s32
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-LABEL: i32_fastcc_i32_byval_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], s32
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: i32_fastcc_i32_byval_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -162,6 +312,36 @@ define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) b
 
 ; Tail call disallowed with byval in parent.
 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, ptr addrspace(5) byval(i32) %b.byval, i32 %c) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_mov_b32 s4, s33
+; CIVI-NEXT:    s_mov_b32 s33, s32
+; CIVI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CIVI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CIVI-NEXT:    s_mov_b64 exec, s[6:7]
+; CIVI-NEXT:    buffer_load_dword v1, off, s[0:3], s33
+; CIVI-NEXT:    v_writelane_b32 v40, s4, 2
+; CIVI-NEXT:    s_addk_i32 s32, 0x400
+; CIVI-NEXT:    v_writelane_b32 v40, s30, 0
+; CIVI-NEXT:    v_writelane_b32 v40, s31, 1
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; CIVI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CIVI-NEXT:    v_readlane_b32 s31, v40, 1
+; CIVI-NEXT:    v_readlane_b32 s30, v40, 0
+; CIVI-NEXT:    s_mov_b32 s32, s33
+; CIVI-NEXT:    v_readlane_b32 s4, v40, 2
+; CIVI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CIVI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CIVI-NEXT:    s_mov_b64 exec, s[6:7]
+; CIVI-NEXT:    s_mov_b32 s33, s4
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -191,6 +371,36 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32 at rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval)
   ret i32 %ret
@@ -200,6 +410,17 @@ entry:
 ; usage of incoming arguments must be <= the outgoing stack
 ; arguments.
 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_byval_i32:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_byval_i32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -210,12 +431,47 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %lar
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_byval_i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5)))
   ret i32 %ret
 }
 
 define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
+; CIVI-LABEL: i32_fastcc_i32_i32_a32i32:
+; CIVI:       ; %bb.0:
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; CIVI-NEXT:    s_waitcnt vmcnt(1)
+; CIVI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-LABEL: i32_fastcc_i32_i32_a32i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: i32_fastcc_i32_i32_a32i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -235,6 +491,25 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
 
 ; FIXME: Why load and store same location for stack args?
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CIVI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    s_waitcnt vmcnt(2)
+; CIVI-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; CIVI-NEXT:    s_waitcnt vmcnt(2)
+; CIVI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    s_waitcnt vmcnt(2)
+; CIVI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -253,12 +528,50 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
   ret i32 %ret
 }
 
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CIVI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    v_mov_b32_e32 v34, 9
+; CIVI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:32
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; CIVI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -277,6 +590,25 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v34, 9
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -289,6 +621,68 @@ entry:
 ; don't do a tail call.
 ; TODO: Do we really need this restriction?
 define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
+; CIVI-LABEL: no_sibling_call_callee_more_stack_space:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_mov_b32 s4, s33
+; CIVI-NEXT:    s_mov_b32 s33, s32
+; CIVI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CIVI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CIVI-NEXT:    s_mov_b64 exec, s[6:7]
+; CIVI-NEXT:    s_addk_i32 s32, 0x400
+; CIVI-NEXT:    v_writelane_b32 v40, s4, 2
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    v_mov_b32_e32 v2, 0
+; CIVI-NEXT:    v_writelane_b32 v40, s30, 0
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    v_mov_b32_e32 v2, 0
+; CIVI-NEXT:    v_mov_b32_e32 v3, 0
+; CIVI-NEXT:    v_mov_b32_e32 v4, 0
+; CIVI-NEXT:    v_mov_b32_e32 v5, 0
+; CIVI-NEXT:    v_mov_b32_e32 v6, 0
+; CIVI-NEXT:    v_mov_b32_e32 v7, 0
+; CIVI-NEXT:    v_mov_b32_e32 v8, 0
+; CIVI-NEXT:    v_mov_b32_e32 v9, 0
+; CIVI-NEXT:    v_mov_b32_e32 v10, 0
+; CIVI-NEXT:    v_mov_b32_e32 v11, 0
+; CIVI-NEXT:    v_mov_b32_e32 v12, 0
+; CIVI-NEXT:    v_mov_b32_e32 v13, 0
+; CIVI-NEXT:    v_mov_b32_e32 v14, 0
+; CIVI-NEXT:    v_mov_b32_e32 v15, 0
+; CIVI-NEXT:    v_mov_b32_e32 v16, 0
+; CIVI-NEXT:    v_mov_b32_e32 v17, 0
+; CIVI-NEXT:    v_mov_b32_e32 v18, 0
+; CIVI-NEXT:    v_mov_b32_e32 v19, 0
+; CIVI-NEXT:    v_mov_b32_e32 v20, 0
+; CIVI-NEXT:    v_mov_b32_e32 v21, 0
+; CIVI-NEXT:    v_mov_b32_e32 v22, 0
+; CIVI-NEXT:    v_mov_b32_e32 v23, 0
+; CIVI-NEXT:    v_mov_b32_e32 v24, 0
+; CIVI-NEXT:    v_mov_b32_e32 v25, 0
+; CIVI-NEXT:    v_mov_b32_e32 v26, 0
+; CIVI-NEXT:    v_mov_b32_e32 v27, 0
+; CIVI-NEXT:    v_mov_b32_e32 v28, 0
+; CIVI-NEXT:    v_mov_b32_e32 v29, 0
+; CIVI-NEXT:    v_mov_b32_e32 v30, 0
+; CIVI-NEXT:    v_writelane_b32 v40, s31, 1
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CIVI-NEXT:    v_readlane_b32 s31, v40, 1
+; CIVI-NEXT:    v_readlane_b32 s30, v40, 0
+; CIVI-NEXT:    s_mov_b32 s32, s33
+; CIVI-NEXT:    v_readlane_b32 s4, v40, 2
+; CIVI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CIVI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CIVI-NEXT:    s_mov_b64 exec, s[6:7]
+; CIVI-NEXT:    s_mov_b32 s33, s4
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GCN-LABEL: no_sibling_call_callee_more_stack_space:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -350,6 +744,68 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: no_sibling_call_callee_more_stack_space:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 2
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0
+; GFX9-NEXT:    v_mov_b32_e32 v19, 0
+; GFX9-NEXT:    v_mov_b32_e32 v20, 0
+; GFX9-NEXT:    v_mov_b32_e32 v21, 0
+; GFX9-NEXT:    v_mov_b32_e32 v22, 0
+; GFX9-NEXT:    v_mov_b32_e32 v23, 0
+; GFX9-NEXT:    v_mov_b32_e32 v24, 0
+; GFX9-NEXT:    v_mov_b32_e32 v25, 0
+; GFX9-NEXT:    v_mov_b32_e32 v26, 0
+; GFX9-NEXT:    v_mov_b32_e32 v27, 0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
+; GFX9-NEXT:    v_mov_b32_e32 v29, 0
+; GFX9-NEXT:    v_mov_b32_e32 v30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
   ret i32 %ret
@@ -357,6 +813,46 @@ entry:
 
 ; Have another non-tail in the function
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
+; CIVI-LABEL: sibling_call_i32_fastcc_i32_i32_other_call:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_mov_b32 s4, s33
+; CIVI-NEXT:    s_mov_b32 s33, s32
+; CIVI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CIVI-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; CIVI-NEXT:    s_mov_b64 exec, s[6:7]
+; CIVI-NEXT:    s_addk_i32 s32, 0x400
+; CIVI-NEXT:    v_writelane_b32 v42, s4, 2
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    v_writelane_b32 v42, s30, 0
+; CIVI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CIVI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; CIVI-NEXT:    v_writelane_b32 v42, s31, 1
+; CIVI-NEXT:    v_mov_b32_e32 v40, v1
+; CIVI-NEXT:    v_mov_b32_e32 v41, v0
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CIVI-NEXT:    v_mov_b32_e32 v2, v0
+; CIVI-NEXT:    v_mov_b32_e32 v0, v41
+; CIVI-NEXT:    v_mov_b32_e32 v1, v40
+; CIVI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; CIVI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CIVI-NEXT:    s_getpc_b64 s[4:5]
+; CIVI-NEXT:    s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
+; CIVI-NEXT:    v_readlane_b32 s31, v42, 1
+; CIVI-NEXT:    v_readlane_b32 s30, v42, 0
+; CIVI-NEXT:    s_mov_b32 s32, s33
+; CIVI-NEXT:    v_readlane_b32 s6, v42, 2
+; CIVI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; CIVI-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; CIVI-NEXT:    s_mov_b64 exec, s[8:9]
+; CIVI-NEXT:    s_mov_b32 s33, s6
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_i32_fastcc_i32_i32_other_call:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -396,6 +892,46 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3
 ; GCN-NEXT:    s_mov_b64 exec, s[8:9]
 ; GCN-NEXT:    s_mov_b32 s33, s6
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: sibling_call_i32_fastcc_i32_i32_other_call:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v42, s4, 2
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX9-NEXT:    v_mov_b32_e32 v40, v1
+; GFX9-NEXT:    v_mov_b32_e32 v41, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v41
+; GFX9-NEXT:    v_mov_b32_e32 v1, v40
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
+; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s6, v42, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-NEXT:    s_mov_b32 s33, s6
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
@@ -405,6 +941,25 @@ entry:
 ; Have stack object in caller and stack passed arguments. SP should be
 ; in same place at function exit.
 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
+; CIVI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CIVI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32 at gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    v_mov_b32_e32 v34, 9
+; CIVI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:32
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; CIVI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -423,6 +978,25 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i3
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v34, 9
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -432,6 +1006,52 @@ entry:
 }
 
 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
+; CIVI-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; CIVI-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32 at gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CIVI-NEXT:    v_mov_b32_e32 v2, 9
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:48
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    v_mov_b32_e32 v2, 0
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    v_mov_b32_e32 v2, 0
+; CIVI-NEXT:    v_mov_b32_e32 v3, 0
+; CIVI-NEXT:    v_mov_b32_e32 v4, 0
+; CIVI-NEXT:    v_mov_b32_e32 v5, 0
+; CIVI-NEXT:    v_mov_b32_e32 v6, 0
+; CIVI-NEXT:    v_mov_b32_e32 v7, 0
+; CIVI-NEXT:    v_mov_b32_e32 v8, 0
+; CIVI-NEXT:    v_mov_b32_e32 v9, 0
+; CIVI-NEXT:    v_mov_b32_e32 v10, 0
+; CIVI-NEXT:    v_mov_b32_e32 v11, 0
+; CIVI-NEXT:    v_mov_b32_e32 v12, 0
+; CIVI-NEXT:    v_mov_b32_e32 v13, 0
+; CIVI-NEXT:    v_mov_b32_e32 v14, 0
+; CIVI-NEXT:    v_mov_b32_e32 v15, 0
+; CIVI-NEXT:    v_mov_b32_e32 v16, 0
+; CIVI-NEXT:    v_mov_b32_e32 v17, 0
+; CIVI-NEXT:    v_mov_b32_e32 v18, 0
+; CIVI-NEXT:    v_mov_b32_e32 v19, 0
+; CIVI-NEXT:    v_mov_b32_e32 v20, 0
+; CIVI-NEXT:    v_mov_b32_e32 v21, 0
+; CIVI-NEXT:    v_mov_b32_e32 v22, 0
+; CIVI-NEXT:    v_mov_b32_e32 v23, 0
+; CIVI-NEXT:    v_mov_b32_e32 v24, 0
+; CIVI-NEXT:    v_mov_b32_e32 v25, 0
+; CIVI-NEXT:    v_mov_b32_e32 v26, 0
+; CIVI-NEXT:    v_mov_b32_e32 v27, 0
+; CIVI-NEXT:    v_mov_b32_e32 v28, 0
+; CIVI-NEXT:    v_mov_b32_e32 v29, 0
+; CIVI-NEXT:    v_mov_b32_e32 v30, 0
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[4:5]
+;
 ; GCN-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -477,6 +1097,52 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg
 ; GCN-NEXT:    v_mov_b32_e32 v30, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32 at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 9
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0
+; GFX9-NEXT:    v_mov_b32_e32 v19, 0
+; GFX9-NEXT:    v_mov_b32_e32 v20, 0
+; GFX9-NEXT:    v_mov_b32_e32 v21, 0
+; GFX9-NEXT:    v_mov_b32_e32 v22, 0
+; GFX9-NEXT:    v_mov_b32_e32 v23, 0
+; GFX9-NEXT:    v_mov_b32_e32 v24, 0
+; GFX9-NEXT:    v_mov_b32_e32 v25, 0
+; GFX9-NEXT:    v_mov_b32_e32 v26, 0
+; GFX9-NEXT:    v_mov_b32_e32 v27, 0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
+; GFX9-NEXT:    v_mov_b32_e32 v29, 0
+; GFX9-NEXT:    v_mov_b32_e32 v30, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -489,6 +1155,18 @@ entry:
 
 ; Do support tail calls with a uniform, but unknown, callee.
 define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
+; CIVI-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12
+; CIVI-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -500,6 +1178,18 @@ define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, func_ptr_gv@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, func_ptr_gv@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv
   %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
@@ -509,6 +1199,188 @@ entry:
 ; We can't support a tail call to a divergent target. Use a waterfall
 ; loop around a regular call
 define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
+; CIVI-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_mov_b32 s16, s33
+; CIVI-NEXT:    s_mov_b32 s33, s32
+; CIVI-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; CIVI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CIVI-NEXT:    s_mov_b64 exec, s[18:19]
+; CIVI-NEXT:    v_writelane_b32 v40, s16, 18
+; CIVI-NEXT:    v_writelane_b32 v40, s30, 0
+; CIVI-NEXT:    v_writelane_b32 v40, s31, 1
+; CIVI-NEXT:    v_writelane_b32 v40, s34, 2
+; CIVI-NEXT:    v_writelane_b32 v40, s35, 3
+; CIVI-NEXT:    v_writelane_b32 v40, s36, 4
+; CIVI-NEXT:    v_writelane_b32 v40, s37, 5
+; CIVI-NEXT:    v_writelane_b32 v40, s38, 6
+; CIVI-NEXT:    v_writelane_b32 v40, s39, 7
+; CIVI-NEXT:    v_writelane_b32 v40, s40, 8
+; CIVI-NEXT:    v_writelane_b32 v40, s41, 9
+; CIVI-NEXT:    v_writelane_b32 v40, s42, 10
+; CIVI-NEXT:    v_writelane_b32 v40, s43, 11
+; CIVI-NEXT:    v_writelane_b32 v40, s44, 12
+; CIVI-NEXT:    v_writelane_b32 v40, s45, 13
+; CIVI-NEXT:    v_writelane_b32 v40, s46, 14
+; CIVI-NEXT:    v_writelane_b32 v40, s47, 15
+; CIVI-NEXT:    v_writelane_b32 v40, s48, 16
+; CIVI-NEXT:    s_mov_b32 s42, s15
+; CIVI-NEXT:    s_mov_b32 s43, s14
+; CIVI-NEXT:    s_mov_b32 s44, s13
+; CIVI-NEXT:    s_mov_b32 s45, s12
+; CIVI-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; CIVI-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; CIVI-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; CIVI-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; CIVI-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; CIVI-NEXT:    s_mov_b64 s[46:47], exec
+; CIVI-NEXT:    s_addk_i32 s32, 0x400
+; CIVI-NEXT:    v_writelane_b32 v40, s49, 17
+; CIVI-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; CIVI-NEXT:    v_readfirstlane_b32 s16, v0
+; CIVI-NEXT:    v_readfirstlane_b32 s17, v1
+; CIVI-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
+; CIVI-NEXT:    s_and_saveexec_b64 s[48:49], vcc
+; CIVI-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; CIVI-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; CIVI-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; CIVI-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CIVI-NEXT:    s_mov_b32 s12, s45
+; CIVI-NEXT:    s_mov_b32 s13, s44
+; CIVI-NEXT:    s_mov_b32 s14, s43
+; CIVI-NEXT:    s_mov_b32 s15, s42
+; CIVI-NEXT:    v_mov_b32_e32 v0, v2
+; CIVI-NEXT:    v_mov_b32_e32 v1, v3
+; CIVI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CIVI-NEXT:    v_mov_b32_e32 v4, v0
+; CIVI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CIVI-NEXT:    ; implicit-def: $vgpr31
+; CIVI-NEXT:    ; implicit-def: $vgpr2
+; CIVI-NEXT:    ; implicit-def: $vgpr3
+; CIVI-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; CIVI-NEXT:    s_cbranch_execnz .LBB18_1
+; CIVI-NEXT:  ; %bb.2:
+; CIVI-NEXT:    s_mov_b64 exec, s[46:47]
+; CIVI-NEXT:    v_mov_b32_e32 v0, v4
+; CIVI-NEXT:    v_readlane_b32 s49, v40, 17
+; CIVI-NEXT:    v_readlane_b32 s48, v40, 16
+; CIVI-NEXT:    v_readlane_b32 s47, v40, 15
+; CIVI-NEXT:    v_readlane_b32 s46, v40, 14
+; CIVI-NEXT:    v_readlane_b32 s45, v40, 13
+; CIVI-NEXT:    v_readlane_b32 s44, v40, 12
+; CIVI-NEXT:    v_readlane_b32 s43, v40, 11
+; CIVI-NEXT:    v_readlane_b32 s42, v40, 10
+; CIVI-NEXT:    v_readlane_b32 s41, v40, 9
+; CIVI-NEXT:    v_readlane_b32 s40, v40, 8
+; CIVI-NEXT:    v_readlane_b32 s39, v40, 7
+; CIVI-NEXT:    v_readlane_b32 s38, v40, 6
+; CIVI-NEXT:    v_readlane_b32 s37, v40, 5
+; CIVI-NEXT:    v_readlane_b32 s36, v40, 4
+; CIVI-NEXT:    v_readlane_b32 s35, v40, 3
+; CIVI-NEXT:    v_readlane_b32 s34, v40, 2
+; CIVI-NEXT:    v_readlane_b32 s31, v40, 1
+; CIVI-NEXT:    v_readlane_b32 s30, v40, 0
+; CIVI-NEXT:    s_mov_b32 s32, s33
+; CIVI-NEXT:    v_readlane_b32 s4, v40, 18
+; CIVI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CIVI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CIVI-NEXT:    s_mov_b64 exec, s[6:7]
+; CIVI-NEXT:    s_mov_b32 s33, s4
+; CIVI-NEXT:    s_waitcnt vmcnt(0)
+; CIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s16, s33
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[18:19]
+; GCN-NEXT:    v_writelane_b32 v40, s16, 18
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    v_writelane_b32 v40, s34, 2
+; GCN-NEXT:    v_writelane_b32 v40, s35, 3
+; GCN-NEXT:    v_writelane_b32 v40, s36, 4
+; GCN-NEXT:    v_writelane_b32 v40, s37, 5
+; GCN-NEXT:    v_writelane_b32 v40, s38, 6
+; GCN-NEXT:    v_writelane_b32 v40, s39, 7
+; GCN-NEXT:    v_writelane_b32 v40, s40, 8
+; GCN-NEXT:    v_writelane_b32 v40, s41, 9
+; GCN-NEXT:    v_writelane_b32 v40, s42, 10
+; GCN-NEXT:    v_writelane_b32 v40, s43, 11
+; GCN-NEXT:    v_writelane_b32 v40, s44, 12
+; GCN-NEXT:    v_writelane_b32 v40, s45, 13
+; GCN-NEXT:    v_writelane_b32 v40, s46, 14
+; GCN-NEXT:    v_writelane_b32 v40, s47, 15
+; GCN-NEXT:    v_writelane_b32 v40, s48, 16
+; GCN-NEXT:    s_mov_b32 s42, s15
+; GCN-NEXT:    s_mov_b32 s43, s14
+; GCN-NEXT:    s_mov_b32 s44, s13
+; GCN-NEXT:    s_mov_b32 s45, s12
+; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    s_mov_b64 s[46:47], exec
+; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    v_writelane_b32 v40, s49, 17
+; GCN-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    v_readfirstlane_b32 s17, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
+; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
+; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GCN-NEXT:    s_mov_b32 s12, s45
+; GCN-NEXT:    s_mov_b32 s13, s44
+; GCN-NEXT:    s_mov_b32 s14, s43
+; GCN-NEXT:    s_mov_b32 s15, s42
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT:    v_mov_b32_e32 v4, v0
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr31
+; GCN-NEXT:    ; implicit-def: $vgpr2
+; GCN-NEXT:    ; implicit-def: $vgpr3
+; GCN-NEXT:    s_xor_b64 exec, exec, s[48:49]
+; GCN-NEXT:    s_cbranch_execnz .LBB18_1
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_mov_b64 exec, s[46:47]
+; GCN-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-NEXT:    v_readlane_b32 s49, v40, 17
+; GCN-NEXT:    v_readlane_b32 s48, v40, 16
+; GCN-NEXT:    v_readlane_b32 s47, v40, 15
+; GCN-NEXT:    v_readlane_b32 s46, v40, 14
+; GCN-NEXT:    v_readlane_b32 s45, v40, 13
+; GCN-NEXT:    v_readlane_b32 s44, v40, 12
+; GCN-NEXT:    v_readlane_b32 s43, v40, 11
+; GCN-NEXT:    v_readlane_b32 s42, v40, 10
+; GCN-NEXT:    v_readlane_b32 s41, v40, 9
+; GCN-NEXT:    v_readlane_b32 s40, v40, 8
+; GCN-NEXT:    v_readlane_b32 s39, v40, 7
+; GCN-NEXT:    v_readlane_b32 s38, v40, 6
+; GCN-NEXT:    v_readlane_b32 s37, v40, 5
+; GCN-NEXT:    v_readlane_b32 s36, v40, 4
+; GCN-NEXT:    v_readlane_b32 s35, v40, 3
+; GCN-NEXT:    v_readlane_b32 s34, v40, 2
+; GCN-NEXT:    v_readlane_b32 s31, v40, 1
+; GCN-NEXT:    v_readlane_b32 s30, v40, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    v_readlane_b32 s4, v40, 18
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b32 s33, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -608,6 +1480,30 @@ entry:
 declare hidden void @void_fastcc_multi_byval(i32 %a, ptr addrspace(5) byval([3 x i32]) align 16, ptr addrspace(5) byval([2 x i64]))
 
 define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
+; CIVI-LABEL: sibling_call_fastcc_multi_byval:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    v_mov_b32_e32 v1, 9
+; CIVI-NEXT:    v_mov_b32_e32 v2, 0
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:172
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:164
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:24
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; CIVI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_fastcc_multi_byval:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -631,6 +1527,30 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
 ; GCN-NEXT:    s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_fastcc_multi_byval:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 9
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:172
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:164
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %alloca0 = alloca [3 x i32], align 16, addrspace(5)
   %alloca1 = alloca [2 x i64], align 8, addrspace(5)
@@ -644,6 +1564,55 @@ declare hidden void @void_fastcc_byval_and_stack_passed(ptr addrspace(5) byval([
 
 ; Callee has a byval and non-byval stack passed argument
 define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
+; CIVI-LABEL: sibling_call_byval_and_stack_passed:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    v_mov_b32_e32 v1, 9
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
+; CIVI-NEXT:    v_mov_b32_e32 v1, 0
+; CIVI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12
+; CIVI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; CIVI-NEXT:    v_mov_b32_e32 v0, 0
+; CIVI-NEXT:    v_mov_b32_e32 v1, 0
+; CIVI-NEXT:    v_mov_b32_e32 v2, 0
+; CIVI-NEXT:    v_mov_b32_e32 v3, 0
+; CIVI-NEXT:    v_mov_b32_e32 v4, 0
+; CIVI-NEXT:    v_mov_b32_e32 v5, 0
+; CIVI-NEXT:    v_mov_b32_e32 v6, 0
+; CIVI-NEXT:    v_mov_b32_e32 v7, 0
+; CIVI-NEXT:    v_mov_b32_e32 v8, 0
+; CIVI-NEXT:    v_mov_b32_e32 v9, 0
+; CIVI-NEXT:    v_mov_b32_e32 v10, 0
+; CIVI-NEXT:    v_mov_b32_e32 v11, 0
+; CIVI-NEXT:    v_mov_b32_e32 v12, 0
+; CIVI-NEXT:    v_mov_b32_e32 v13, 0
+; CIVI-NEXT:    v_mov_b32_e32 v14, 0
+; CIVI-NEXT:    v_mov_b32_e32 v15, 0
+; CIVI-NEXT:    v_mov_b32_e32 v16, 0
+; CIVI-NEXT:    v_mov_b32_e32 v17, 0
+; CIVI-NEXT:    v_mov_b32_e32 v18, 0
+; CIVI-NEXT:    v_mov_b32_e32 v19, 0
+; CIVI-NEXT:    v_mov_b32_e32 v20, 0
+; CIVI-NEXT:    v_mov_b32_e32 v21, 0
+; CIVI-NEXT:    v_mov_b32_e32 v22, 0
+; CIVI-NEXT:    v_mov_b32_e32 v23, 0
+; CIVI-NEXT:    v_mov_b32_e32 v24, 0
+; CIVI-NEXT:    v_mov_b32_e32 v25, 0
+; CIVI-NEXT:    v_mov_b32_e32 v26, 0
+; CIVI-NEXT:    v_mov_b32_e32 v27, 0
+; CIVI-NEXT:    v_mov_b32_e32 v28, 0
+; CIVI-NEXT:    v_mov_b32_e32 v29, 0
+; CIVI-NEXT:    v_mov_b32_e32 v30, 0
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_byval_and_stack_passed:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -692,6 +1661,55 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
 ; GCN-NEXT:    s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_byval_and_stack_passed:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 9
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0
+; GFX9-NEXT:    v_mov_b32_e32 v19, 0
+; GFX9-NEXT:    v_mov_b32_e32 v20, 0
+; GFX9-NEXT:    v_mov_b32_e32 v21, 0
+; GFX9-NEXT:    v_mov_b32_e32 v22, 0
+; GFX9-NEXT:    v_mov_b32_e32 v23, 0
+; GFX9-NEXT:    v_mov_b32_e32 v24, 0
+; GFX9-NEXT:    v_mov_b32_e32 v25, 0
+; GFX9-NEXT:    v_mov_b32_e32 v26, 0
+; GFX9-NEXT:    v_mov_b32_e32 v27, 0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
+; GFX9-NEXT:    v_mov_b32_e32 v29, 0
+; GFX9-NEXT:    v_mov_b32_e32 v30, 0
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %alloca = alloca [3 x i32], align 16, addrspace(5)
   store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca
@@ -702,6 +1720,14 @@ entry:
 declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0)
 
 define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
+; CIVI-LABEL: sibling_call_i64_fastcc_i64:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_i64_fastcc_i64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -709,6 +1735,14 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
 ; GCN-NEXT:    s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_i64_fastcc_i64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
   ret i64 %ret
@@ -717,6 +1751,14 @@ entry:
 declare hidden fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %arg0)
 
 define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspace(1) %a) #1 {
+; CIVI-LABEL: sibling_call_p1i8_fastcc_p1i8:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_p1i8_fastcc_p1i8:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -724,6 +1766,14 @@ define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspa
 ; GCN-NEXT:    s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_p1i8_fastcc_p1i8:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a)
   ret ptr addrspace(1) %ret
@@ -732,6 +1782,14 @@ entry:
 declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0)
 
 define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
+; CIVI-LABEL: sibling_call_i16_fastcc_i16:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_i16_fastcc_i16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -739,6 +1797,14 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
 ; GCN-NEXT:    s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_i16_fastcc_i16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
   ret i16 %ret
@@ -747,6 +1813,14 @@ entry:
 declare hidden fastcc half @f16_fastcc_f16(half %arg0)
 
 define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
+; CIVI-LABEL: sibling_call_f16_fastcc_f16:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_f16_fastcc_f16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -754,6 +1828,14 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
 ; GCN-NEXT:    s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_f16_fastcc_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc half @f16_fastcc_f16(half %a)
   ret half %ret
@@ -762,6 +1844,14 @@ entry:
 declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0)
 
 define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 {
+; CIVI-LABEL: sibling_call_v3i16_fastcc_v3i16:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_v3i16_fastcc_v3i16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -769,6 +1859,14 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1
 ; GCN-NEXT:    s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_v3i16_fastcc_v3i16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
   ret <3 x i16> %ret
@@ -777,6 +1875,14 @@ entry:
 declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0)
 
 define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 {
+; CIVI-LABEL: sibling_call_v4i16_fastcc_v4i16:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_v4i16_fastcc_v4i16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -784,6 +1890,14 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1
 ; GCN-NEXT:    s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_v4i16_fastcc_v4i16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
   ret <4 x i16> %ret
@@ -792,6 +1906,14 @@ entry:
 declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0)
 
 define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 {
+; CIVI-LABEL: sibling_call_v2i64_fastcc_v2i64:
+; CIVI:       ; %bb.0: ; %entry
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIVI-NEXT:    s_getpc_b64 s[16:17]
+; CIVI-NEXT:    s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4
+; CIVI-NEXT:    s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12
+; CIVI-NEXT:    s_setpc_b64 s[16:17]
+;
 ; GCN-LABEL: sibling_call_v2i64_fastcc_v2i64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -799,6 +1921,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1
 ; GCN-NEXT:    s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX9-LABEL: sibling_call_v2i64_fastcc_v2i64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12
+; GFX9-NEXT:    s_setpc_b64 s[16:17]
 entry:
   %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)
   ret <2 x i64> %ret


