[llvm] 33aba5d - [AMDGPU] Switch to autogenerated checks

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 5 16:28:10 PST 2023


Author: Jeffrey Byrnes
Date: 2023-01-05T16:27:18-08:00
New Revision: 33aba5d0d041a555b0f6a839433d561add710150

URL: https://github.com/llvm/llvm-project/commit/33aba5d0d041a555b0f6a839433d561add710150
DIFF: https://github.com/llvm/llvm-project/commit/33aba5d0d041a555b0f6a839433d561add710150.diff

LOG: [AMDGPU] Switch to autogenerated checks

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 36ac9d212b68..ecb949c46b8f 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -1,128 +1,687 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W32
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W32
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX9_W64
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W32
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1010_W64
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W32
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=GFX1100_W64
 ; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
 
 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
 
-; W64-LABEL: mubuf_vgpr
-; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-; W64: [[LOOPBB:.LBB[0-9]+_[0-9]+]]:
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; W64: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W64: s_xor_b64 exec, exec, [[AND]]
-; W64: s_cbranch_execnz [[LOOPBB]]
-; W64: s_mov_b64 exec, [[SAVEEXEC]]
-; W64: v_mov_b32_e32 v0, [[RES]]
-
-; W32-LABEL: mubuf_vgpr
-; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
-; W32: [[LOOPBB:.LBB[0-9]+_[0-9]+]]:
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
-; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
-; W32: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
-; W32: s_cbranch_execnz [[LOOPBB]]
-; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
-; W32: v_mov_b32_e32 v0, [[RES]]
-
 define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
+; GFX9_W64-LABEL: mubuf_vgpr:
+; GFX9_W64:       ; %bb.0:
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_W64-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX9_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX9_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX9_W64-NEXT:    s_nop 0
+; GFX9_W64-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr4
+; GFX9_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9_W64-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9_W64-NEXT:  ; %bb.2:
+; GFX9_W64-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010_W32-LABEL: mubuf_vgpr:
+; GFX1010_W32:       ; %bb.0:
+; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1010_W32-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX1010_W32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX1010_W32-NEXT:    s_and_saveexec_b32 s4, s4
+; GFX1010_W32-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr4
+; GFX1010_W32-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1010_W32-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1010_W32-NEXT:  ; %bb.2:
+; GFX1010_W32-NEXT:    s_mov_b32 exec_lo, s5
+; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010_W32-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010_W64-LABEL: mubuf_vgpr:
+; GFX1010_W64:       ; %bb.0:
+; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1010_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX1010_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX1010_W64-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr4
+; GFX1010_W64-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1010_W64-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1010_W64-NEXT:  ; %bb.2:
+; GFX1010_W64-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100_W32-LABEL: mubuf_vgpr:
+; GFX1100_W32:       ; %bb.0:
+; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1100_W32-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1100_W32-NEXT:    buffer_load_format_x v5, v4, s[4:7], 0 idxen
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr4
+; GFX1100_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1100_W32-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1100_W32-NEXT:  ; %bb.2:
+; GFX1100_W32-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100_W32-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100_W64-LABEL: mubuf_vgpr:
+; GFX1100_W64:       ; %bb.0:
+; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1100_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX1100_W64-NEXT:    buffer_load_format_x v5, v4, s[4:7], 0 idxen
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr4
+; GFX1100_W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1100_W64-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1100_W64-NEXT:  ; %bb.2:
+; GFX1100_W64-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; W64-O0-LABEL: mubuf_vgpr:
+; W64-O0:       ; %bb.0:
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v4, v3
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v7, v1
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
+; W64-O0-NEXT:    s_mov_b32 s4, 0
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 0
+; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 1
+; W64-O0-NEXT:    v_writelane_b32 v5, s5, 2
+; W64-O0-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
+; W64-O0-NEXT:    s_mov_b32 s4, s8
+; W64-O0-NEXT:    s_mov_b32 s5, s12
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
+; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; W64-O0-NEXT:    s_mov_b32 s9, s12
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_writelane_b32 v5, s8, 3
+; W64-O0-NEXT:    v_writelane_b32 v5, s9, 4
+; W64-O0-NEXT:    v_writelane_b32 v5, s10, 5
+; W64-O0-NEXT:    v_writelane_b32 v5, s11, 6
+; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; W64-O0-NEXT:    v_writelane_b32 v5, s4, 7
+; W64-O0-NEXT:    v_writelane_b32 v5, s5, 8
+; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v5, 7
+; W64-O0-NEXT:    v_readlane_b32 s5, v5, 8
+; W64-O0-NEXT:    v_readlane_b32 s8, v5, 3
+; W64-O0-NEXT:    v_readlane_b32 s9, v5, 4
+; W64-O0-NEXT:    v_readlane_b32 s10, v5, 5
+; W64-O0-NEXT:    v_readlane_b32 s11, v5, 6
+; W64-O0-NEXT:    v_readlane_b32 s6, v5, 0
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 3
+; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; W64-O0-NEXT:    s_cbranch_execnz .LBB0_1
+; W64-O0-NEXT:  ; %bb.3:
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v5, 1
+; W64-O0-NEXT:    v_readlane_b32 s5, v5, 2
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_setpc_b64 s[30:31]
   %call = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1
   ret float %call
 }
 
 
-; W64-LABEL: mubuf_vgpr_adjacent_in_block
-
-; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-; W64: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]:
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; W64: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W64: s_xor_b64 exec, exec, [[SAVE]]
-; W64: s_cbranch_execnz [[LOOPBB0]]
-
-; W64: s_mov_b64 exec, [[SAVEEXEC]]
-; FIXME: redundant s_mov
-; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-
-; W64: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]:
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; W64: buffer_load_format_x [[RES1:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W64: s_xor_b64 exec, exec, [[SAVE]]
-; W64: s_cbranch_execnz [[LOOPBB1]]
-
-; W64: s_mov_b64 exec, [[SAVEEXEC]]
-; W64-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES0]], off
-; W64-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES1]], off
-
-
-; W32-LABEL: mubuf_vgpr_adjacent_in_block
-
-; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
-; W32: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]:
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
-; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
-; W32: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
-; W32: s_cbranch_execnz [[LOOPBB0]]
 
-; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
 ; FIXME: redundant s_mov
-; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
-
-; W32: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]:
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
-; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
-; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
-; W32: s_cbranch_execnz [[LOOPBB1]]
-
-; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
-; W32-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES0]], off
-; W32-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES1]], off
 
 define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 {
+; GFX9_W64-LABEL: mubuf_vgpr_adjacent_in_block:
+; GFX9_W64:       ; %bb.0: ; %entry
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_W64-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX9_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX9_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX9_W64-NEXT:    s_nop 0
+; GFX9_W64-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX9_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9_W64-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9_W64-NEXT:  ; %bb.2:
+; GFX9_W64-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9_W64-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9_W64-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s11, v7
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5]
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
+; GFX9_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX9_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX9_W64-NEXT:    s_nop 0
+; GFX9_W64-NEXT:    buffer_load_format_x v0, v8, s[8:11], 0 idxen
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr8
+; GFX9_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9_W64-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX9_W64-NEXT:  ; %bb.4:
+; GFX9_W64-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(1)
+; GFX9_W64-NEXT:    global_store_dword v[9:10], v13, off
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    global_store_dword v[11:12], v0, off
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010_W32-LABEL: mubuf_vgpr_adjacent_in_block:
+; GFX1010_W32:       ; %bb.0: ; %entry
+; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1010_W32-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX1010_W32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX1010_W32-NEXT:    s_and_saveexec_b32 s4, s4
+; GFX1010_W32-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1010_W32-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1010_W32-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1010_W32-NEXT:  ; %bb.2:
+; GFX1010_W32-NEXT:    s_mov_b32 exec_lo, s5
+; GFX1010_W32-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1010_W32-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s11, v7
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5]
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[6:7]
+; GFX1010_W32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX1010_W32-NEXT:    s_and_saveexec_b32 s4, s4
+; GFX1010_W32-NEXT:    buffer_load_format_x v0, v8, s[8:11], 0 idxen
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr8
+; GFX1010_W32-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1010_W32-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX1010_W32-NEXT:  ; %bb.4:
+; GFX1010_W32-NEXT:    s_mov_b32 exec_lo, s5
+; GFX1010_W32-NEXT:    s_waitcnt vmcnt(1)
+; GFX1010_W32-NEXT:    global_store_dword v[9:10], v13, off
+; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010_W32-NEXT:    global_store_dword v[11:12], v0, off
+; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010_W64-LABEL: mubuf_vgpr_adjacent_in_block:
+; GFX1010_W64:       ; %bb.0: ; %entry
+; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1010_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX1010_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX1010_W64-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1010_W64-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1010_W64-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1010_W64-NEXT:  ; %bb.2:
+; GFX1010_W64-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1010_W64-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1010_W64-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s11, v7
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5]
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
+; GFX1010_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX1010_W64-NEXT:    buffer_load_format_x v0, v8, s[8:11], 0 idxen
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr8
+; GFX1010_W64-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1010_W64-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX1010_W64-NEXT:  ; %bb.4:
+; GFX1010_W64-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX1010_W64-NEXT:    s_waitcnt vmcnt(1)
+; GFX1010_W64-NEXT:    global_store_dword v[9:10], v13, off
+; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010_W64-NEXT:    global_store_dword v[11:12], v0, off
+; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100_W32-LABEL: mubuf_vgpr_adjacent_in_block:
+; GFX1100_W32:       ; %bb.0: ; %entry
+; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1100_W32-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1100_W32-NEXT:    buffer_load_format_x v13, v8, s[4:7], 0 idxen
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1100_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1100_W32-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1100_W32-NEXT:  ; %bb.2:
+; GFX1100_W32-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1100_W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1100_W32-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s7, v7
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1100_W32-NEXT:    buffer_load_format_x v0, v8, s[4:7], 0 idxen
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr8
+; GFX1100_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1100_W32-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX1100_W32-NEXT:  ; %bb.4:
+; GFX1100_W32-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1100_W32-NEXT:    s_waitcnt vmcnt(1)
+; GFX1100_W32-NEXT:    global_store_b32 v[9:10], v13, off dlc
+; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100_W32-NEXT:    global_store_b32 v[11:12], v0, off dlc
+; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100_W64-LABEL: mubuf_vgpr_adjacent_in_block:
+; GFX1100_W64:       ; %bb.0: ; %entry
+; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1100_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX1100_W64-NEXT:    buffer_load_format_x v13, v8, s[4:7], 0 idxen
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1100_W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1100_W64-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1100_W64-NEXT:  ; %bb.2:
+; GFX1100_W64-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1100_W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1100_W64-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s7, v7
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX1100_W64-NEXT:    buffer_load_format_x v0, v8, s[4:7], 0 idxen
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr8
+; GFX1100_W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1100_W64-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX1100_W64-NEXT:  ; %bb.4:
+; GFX1100_W64-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1100_W64-NEXT:    s_waitcnt vmcnt(1)
+; GFX1100_W64-NEXT:    global_store_b32 v[9:10], v13, off dlc
+; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100_W64-NEXT:    global_store_b32 v[11:12], v0, off dlc
+; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; W64-O0-LABEL: mubuf_vgpr_adjacent_in_block:
+; W64-O0:       ; %bb.0: ; %entry
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v14, v4
+; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v3
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_mov_b32_e32 v7, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v8, v1
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v0
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v15, v5
+; W64-O0-NEXT:    s_waitcnt vmcnt(3)
+; W64-O0-NEXT:    v_mov_b32_e32 v16, v4
+; W64-O0-NEXT:    s_waitcnt vmcnt(2)
+; W64-O0-NEXT:    v_mov_b32_e32 v17, v3
+; W64-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v8
+; W64-O0-NEXT:    v_mov_b32_e32 v4, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v6
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v12
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v10
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
+; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
+; W64-O0-NEXT:    s_mov_b32 s4, 0
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 0
+; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 1
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 2
+; W64-O0-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
+; W64-O0-NEXT:    s_mov_b32 s4, s8
+; W64-O0-NEXT:    s_mov_b32 s5, s12
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
+; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; W64-O0-NEXT:    s_mov_b32 s9, s12
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_writelane_b32 v13, s8, 3
+; W64-O0-NEXT:    v_writelane_b32 v13, s9, 4
+; W64-O0-NEXT:    v_writelane_b32 v13, s10, 5
+; W64-O0-NEXT:    v_writelane_b32 v13, s11, 6
+; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 7
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 8
+; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB1_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 7
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 8
+; W64-O0-NEXT:    v_readlane_b32 s8, v13, 3
+; W64-O0-NEXT:    v_readlane_b32 s9, v13, 4
+; W64-O0-NEXT:    v_readlane_b32 s10, v13, 5
+; W64-O0-NEXT:    v_readlane_b32 s11, v13, 6
+; W64-O0-NEXT:    v_readlane_b32 s6, v13, 0
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 3
+; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; W64-O0-NEXT:    s_cbranch_execnz .LBB1_1
+; W64-O0-NEXT:  ; %bb.3:
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 1
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 2
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 9
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 10
+; W64-O0-NEXT:  .LBB1_4: ; =>This Inner Loop Header: Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
+; W64-O0-NEXT:    s_mov_b32 s4, s8
+; W64-O0-NEXT:    s_mov_b32 s5, s12
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
+; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; W64-O0-NEXT:    s_mov_b32 s9, s12
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_writelane_b32 v13, s8, 11
+; W64-O0-NEXT:    v_writelane_b32 v13, s9, 12
+; W64-O0-NEXT:    v_writelane_b32 v13, s10, 13
+; W64-O0-NEXT:    v_writelane_b32 v13, s11, 14
+; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; W64-O0-NEXT:    v_writelane_b32 v13, s4, 15
+; W64-O0-NEXT:    v_writelane_b32 v13, s5, 16
+; W64-O0-NEXT:  ; %bb.5: ; in Loop: Header=BB1_4 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 15
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 16
+; W64-O0-NEXT:    v_readlane_b32 s8, v13, 11
+; W64-O0-NEXT:    v_readlane_b32 s9, v13, 12
+; W64-O0-NEXT:    v_readlane_b32 s10, v13, 13
+; W64-O0-NEXT:    v_readlane_b32 s11, v13, 14
+; W64-O0-NEXT:    v_readlane_b32 s6, v13, 0
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 3
+; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; W64-O0-NEXT:    s_cbranch_execnz .LBB1_4
+; W64-O0-NEXT:  ; %bb.6:
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v13, 9
+; W64-O0-NEXT:    v_readlane_b32 s5, v13, 10
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    global_store_dword v[3:4], v5, off
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    global_store_dword v[0:1], v2, off
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; W64-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1
   %val1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %j, i32 %c, i32 0, i32 0, i32 0) #1
@@ -131,183 +690,521 @@ entry:
   ret void
 }
 
-
-; W64-LABEL: mubuf_vgpr_outside_entry
-
-; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s{{[0-9]+}}
-; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-
-; W64: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]:
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W64: s_xor_b64 exec, exec, [[SAVE]]
-; W64: s_cbranch_execnz [[LOOPBB0]]
-
-; W64: s_mov_b64 exec, [[SAVEEXEC]]
-; W64: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]]
-
-; W64: ; %bb.{{[0-9]+}}:
-; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s{{[0-9]+}}
-; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-
-; W64: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]:
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
-; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; W64: buffer_load_format_x [[RES]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W64: s_xor_b64 exec, exec, [[SAVE]]
-; W64: s_cbranch_execnz [[LOOPBB1]]
-
-; W64: s_mov_b64 exec, [[SAVEEXEC]]
-
-; W64: [[TERMBB]]:
-; W64: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES]], off
-
-
-; W32-LABEL: mubuf_vgpr_outside_entry
-
-; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
-; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
-
-; W32: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]:
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
-; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
-; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
-; W32: s_cbranch_execnz [[LOOPBB0]]
-
-; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
-; W32: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]]
-
-; W32: ; %bb.{{[0-9]+}}:
-; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
-; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
-
-; W32: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]:
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v[[VRSRC0:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]]
-; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]]
-; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]]
-; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]]
-; W32: buffer_load_format_x [[RES]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen
-; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]]
-; W32: s_cbranch_execnz [[LOOPBB1]]
-
-; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
-
-; W32: [[TERMBB]]:
-; W32: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES]], off
-
-
 ; Confirm spills do not occur between the XOR and branch that terminate the
 ; waterfall loop BBs.
 
-; W64-O0-LABEL: mubuf_vgpr_outside_entry
-
-; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s{{[0-9]+}}
-; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], s{{[0-9]+}}
-; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
-; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-
-; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1
-; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
-; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]]
-; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
-; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[S0]]
-; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
-; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
-; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
-; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
-; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
-; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W64-O0-DAG: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
-; W64-O0-DAG: s_mov_b32 s[[S1:[0-9]+]], s[[SRSRCTMP1]]
-; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]]
-; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]]
-; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload
-; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen
-; W64-O0: s_waitcnt vmcnt(0)
-; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
-; W64-O0: s_xor_b64 exec, exec, [[SAVE]]
-; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB0]]
-
-; XXX-W64-O0: s_mov_b64 exec, [[SAVEEXEC]]
-; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
-; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill
-; W64-O0: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]]
-
-; W64-O0: ; %bb.{{[0-9]+}}: ; %bb1
-; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
-; W64-O0-DAG: s_mov_b64 s[[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]], exec
-; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
-; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]
-
-; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1
-; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0: s_waitcnt vmcnt(0)
-; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]]
-; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
-; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[S0]]
-; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
-; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]]
-; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
-; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
-; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
-; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
-; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]]
-; W64-O0-DAG: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
-; W64-O0-DAG: s_mov_b32 s[[S1:[0-9]+]], s[[SRSRCTMP1]]
-; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]]
-; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]]
-; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32  offset:[[IDX_OFF]] ; 4-byte Folded Reload
-; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen
-; W64-O0: s_waitcnt vmcnt(0)
-; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
-; W64-O0: s_xor_b64 exec, exec, [[SAVE]]
-; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]]
-
-; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
-; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
-; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
-; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]]
-; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill
-
-; W64-O0: [[TERMBB]]:
-; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Reload
-; W64-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off
-
 define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+; GFX9_W64-LABEL: mubuf_vgpr_outside_entry:
+; GFX9_W64:       ; %bb.0: ; %entry
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9_W64-NEXT:    ;;#ASMSTART
+; GFX9_W64-NEXT:    s_mov_b32 s4, 17
+; GFX9_W64-NEXT:    ;;#ASMEND
+; GFX9_W64-NEXT:    v_mov_b32_e32 v8, s4
+; GFX9_W64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9_W64-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3]
+; GFX9_W64-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
+; GFX9_W64-NEXT:    s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX9_W64-NEXT:    s_nop 0
+; GFX9_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr8
+; GFX9_W64-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX9_W64-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9_W64-NEXT:  ; %bb.2:
+; GFX9_W64-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX9_W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX9_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX9_W64-NEXT:    s_cbranch_execz .LBB2_6
+; GFX9_W64-NEXT:  ; %bb.3: ; %bb1
+; GFX9_W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9_W64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9_W64-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX9_W64-NEXT:    v_readfirstlane_b32 s11, v7
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5]
+; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
+; GFX9_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX9_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX9_W64-NEXT:    s_nop 0
+; GFX9_W64-NEXT:    buffer_load_format_x v9, v0, s[8:11], 0 idxen
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX9_W64-NEXT:    ; implicit-def: $vgpr0
+; GFX9_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX9_W64-NEXT:    s_cbranch_execnz .LBB2_4
+; GFX9_W64-NEXT:  ; %bb.5:
+; GFX9_W64-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX9_W64-NEXT:  .LBB2_6: ; %bb2
+; GFX9_W64-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    global_store_dword v[11:12], v9, off
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010_W32-LABEL: mubuf_vgpr_outside_entry:
+; GFX1010_W32:       ; %bb.0: ; %entry
+; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    ;;#ASMSTART
+; GFX1010_W32-NEXT:    s_mov_b32 s4, 17
+; GFX1010_W32-NEXT:    ;;#ASMEND
+; GFX1010_W32-NEXT:    v_mov_b32_e32 v8, s4
+; GFX1010_W32-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1010_W32-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s5, s[10:11], v[2:3]
+; GFX1010_W32-NEXT:    s_and_b32 s5, vcc_lo, s5
+; GFX1010_W32-NEXT:    s_and_saveexec_b32 s5, s5
+; GFX1010_W32-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr8
+; GFX1010_W32-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s5
+; GFX1010_W32-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1010_W32-NEXT:  ; %bb.2:
+; GFX1010_W32-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1010_W32-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX1010_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1010_W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1010_W32-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1010_W32-NEXT:  ; %bb.3: ; %bb1
+; GFX1010_W32-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010_W32-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1010_W32-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX1010_W32-NEXT:    v_readfirstlane_b32 s11, v7
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5]
+; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[6:7]
+; GFX1010_W32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX1010_W32-NEXT:    s_and_saveexec_b32 s4, s4
+; GFX1010_W32-NEXT:    buffer_load_format_x v9, v0, s[8:11], 0 idxen
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0
+; GFX1010_W32-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX1010_W32-NEXT:    s_cbranch_execnz .LBB2_4
+; GFX1010_W32-NEXT:  ; %bb.5:
+; GFX1010_W32-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1010_W32-NEXT:  .LBB2_6: ; %bb2
+; GFX1010_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010_W32-NEXT:    global_store_dword v[11:12], v9, off
+; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010_W64-LABEL: mubuf_vgpr_outside_entry:
+; GFX1010_W64:       ; %bb.0: ; %entry
+; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    ;;#ASMSTART
+; GFX1010_W64-NEXT:    s_mov_b32 s4, 17
+; GFX1010_W64-NEXT:    ;;#ASMEND
+; GFX1010_W64-NEXT:    v_mov_b32_e32 v8, s4
+; GFX1010_W64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX1010_W64-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3]
+; GFX1010_W64-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
+; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[6:7], s[6:7]
+; GFX1010_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr8
+; GFX1010_W64-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W64-NEXT:    s_xor_b64 exec, exec, s[6:7]
+; GFX1010_W64-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1010_W64-NEXT:  ; %bb.2:
+; GFX1010_W64-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX1010_W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX1010_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX1010_W64-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1010_W64-NEXT:  ; %bb.3: ; %bb1
+; GFX1010_W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010_W64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX1010_W64-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX1010_W64-NEXT:    v_readfirstlane_b32 s11, v7
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[4:5]
+; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
+; GFX1010_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX1010_W64-NEXT:    buffer_load_format_x v9, v0, s[8:11], 0 idxen
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0
+; GFX1010_W64-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX1010_W64-NEXT:    s_cbranch_execnz .LBB2_4
+; GFX1010_W64-NEXT:  ; %bb.5:
+; GFX1010_W64-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX1010_W64-NEXT:  .LBB2_6: ; %bb2
+; GFX1010_W64-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010_W64-NEXT:    global_store_dword v[11:12], v9, off
+; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100_W32-LABEL: mubuf_vgpr_outside_entry:
+; GFX1100_W32:       ; %bb.0: ; %entry
+; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    ;;#ASMSTART
+; GFX1100_W32-NEXT:    s_mov_b32 s4, 17
+; GFX1100_W32-NEXT:    ;;#ASMEND
+; GFX1100_W32-NEXT:    v_mov_b32_e32 v8, s4
+; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1100_W32-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[10:11], v[2:3]
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1100_W32-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr8
+; GFX1100_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1100_W32-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1100_W32-NEXT:  ; %bb.2:
+; GFX1100_W32-NEXT:    s_mov_b32 exec_lo, s1
+; GFX1100_W32-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100_W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1100_W32-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1100_W32-NEXT:  ; %bb.3: ; %bb1
+; GFX1100_W32-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100_W32-NEXT:    s_mov_b32 s2, exec_lo
+; GFX1100_W32-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX1100_W32-NEXT:    v_readfirstlane_b32 s7, v7
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
+; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX1100_W32-NEXT:    buffer_load_format_x v9, v0, s[4:7], 0 idxen
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0
+; GFX1100_W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX1100_W32-NEXT:    s_cbranch_execnz .LBB2_4
+; GFX1100_W32-NEXT:  ; %bb.5:
+; GFX1100_W32-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1100_W32-NEXT:  .LBB2_6: ; %bb2
+; GFX1100_W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100_W32-NEXT:    global_store_b32 v[11:12], v9, off dlc
+; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100_W64-LABEL: mubuf_vgpr_outside_entry:
+; GFX1100_W64:       ; %bb.0: ; %entry
+; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    ;;#ASMSTART
+; GFX1100_W64-NEXT:    s_mov_b32 s4, 17
+; GFX1100_W64-NEXT:    ;;#ASMEND
+; GFX1100_W64-NEXT:    v_mov_b32_e32 v8, s4
+; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1100_W64-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX1100_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr8
+; GFX1100_W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1100_W64-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1100_W64-NEXT:  ; %bb.2:
+; GFX1100_W64-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX1100_W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100_W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1100_W64-NEXT:    s_cbranch_execz .LBB2_6
+; GFX1100_W64-NEXT:  ; %bb.3: ; %bb1
+; GFX1100_W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1100_W64-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1100_W64-NEXT:  .LBB2_4: ; =>This Inner Loop Header: Depth=1
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX1100_W64-NEXT:    v_readfirstlane_b32 s7, v7
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
+; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX1100_W64-NEXT:    buffer_load_format_x v9, v0, s[4:7], 0 idxen
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0
+; GFX1100_W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX1100_W64-NEXT:    s_cbranch_execnz .LBB2_4
+; GFX1100_W64-NEXT:  ; %bb.5:
+; GFX1100_W64-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX1100_W64-NEXT:  .LBB2_6: ; %bb2
+; GFX1100_W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX1100_W64-NEXT:    global_store_b32 v[11:12], v9, off dlc
+; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
+;
+; W64-O0-LABEL: mubuf_vgpr_outside_entry:
+; W64-O0:       ; %bb.0: ; %entry
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; W64-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v9, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v10, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v11, v5
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v4
+; W64-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v3
+; W64-O0-NEXT:    v_mov_b32_e32 v6, v2
+; W64-O0-NEXT:    v_mov_b32_e32 v7, v1
+; W64-O0-NEXT:    v_mov_b32_e32 v13, v0
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v1, v11
+; W64-O0-NEXT:    v_mov_b32_e32 v2, v10
+; W64-O0-NEXT:    v_mov_b32_e32 v3, v9
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v14, v7
+; W64-O0-NEXT:    v_mov_b32_e32 v15, v6
+; W64-O0-NEXT:    v_mov_b32_e32 v16, v5
+; W64-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; implicit-def: $sgpr4
+; W64-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 killed $exec
+; W64-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; W64-O0-NEXT:    v_mov_b32_e32 v5, v12
+; W64-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; W64-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
+; W64-O0-NEXT:    ;;#ASMSTART
+; W64-O0-NEXT:    s_mov_b32 s4, 17
+; W64-O0-NEXT:    ;;#ASMEND
+; W64-O0-NEXT:    s_mov_b32 s5, s4
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 0
+; W64-O0-NEXT:    s_mov_b32 s5, 0
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 1
+; W64-O0-NEXT:    v_mov_b32_e32 v0, s4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 2
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 3
+; W64-O0-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
+; W64-O0-NEXT:    s_mov_b32 s4, s8
+; W64-O0-NEXT:    s_mov_b32 s5, s12
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
+; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; W64-O0-NEXT:    s_mov_b32 s9, s12
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_writelane_b32 v8, s8, 4
+; W64-O0-NEXT:    v_writelane_b32 v8, s9, 5
+; W64-O0-NEXT:    v_writelane_b32 v8, s10, 6
+; W64-O0-NEXT:    v_writelane_b32 v8, s11, 7
+; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 8
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 9
+; W64-O0-NEXT:  ; %bb.2: ; in Loop: Header=BB2_1 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 8
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 9
+; W64-O0-NEXT:    v_readlane_b32 s8, v8, 4
+; W64-O0-NEXT:    v_readlane_b32 s9, v8, 5
+; W64-O0-NEXT:    v_readlane_b32 s10, v8, 6
+; W64-O0-NEXT:    v_readlane_b32 s11, v8, 7
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 1
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 3
+; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; W64-O0-NEXT:    s_cbranch_execnz .LBB2_1
+; W64-O0-NEXT:  ; %bb.3:
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 2
+; W64-O0-NEXT:    v_readlane_b32 s7, v8, 3
+; W64-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 1
+; W64-O0-NEXT:    s_mov_b32 s5, 0x3ff
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_and_b32_e64 v1, v1, s5
+; W64-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v1, s4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 10
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 11
+; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    s_cbranch_execz .LBB2_8
+; W64-O0-NEXT:  ; %bb.4: ; %bb1
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 0
+; W64-O0-NEXT:    s_mov_b32 s5, 0
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 12
+; W64-O0-NEXT:    v_mov_b32_e32 v0, s4
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; W64-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
+; W64-O0-NEXT:    s_mov_b64 s[4:5], exec
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 13
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 14
+; W64-O0-NEXT:  .LBB2_5: ; =>This Inner Loop Header: Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    v_readfirstlane_b32 s8, v0
+; W64-O0-NEXT:    v_readfirstlane_b32 s12, v1
+; W64-O0-NEXT:    s_mov_b32 s4, s8
+; W64-O0-NEXT:    s_mov_b32 s5, s12
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1]
+; W64-O0-NEXT:    v_readfirstlane_b32 s7, v2
+; W64-O0-NEXT:    v_readfirstlane_b32 s6, v3
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3]
+; W64-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; W64-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; W64-O0-NEXT:    s_mov_b32 s9, s12
+; W64-O0-NEXT:    s_mov_b32 s10, s7
+; W64-O0-NEXT:    s_mov_b32 s11, s6
+; W64-O0-NEXT:    v_writelane_b32 v8, s8, 15
+; W64-O0-NEXT:    v_writelane_b32 v8, s9, 16
+; W64-O0-NEXT:    v_writelane_b32 v8, s10, 17
+; W64-O0-NEXT:    v_writelane_b32 v8, s11, 18
+; W64-O0-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; W64-O0-NEXT:    v_writelane_b32 v8, s4, 19
+; W64-O0-NEXT:    v_writelane_b32 v8, s5, 20
+; W64-O0-NEXT:  ; %bb.6: ; in Loop: Header=BB2_5 Depth=1
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 19
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 20
+; W64-O0-NEXT:    v_readlane_b32 s8, v8, 15
+; W64-O0-NEXT:    v_readlane_b32 s9, v8, 16
+; W64-O0-NEXT:    v_readlane_b32 s10, v8, 17
+; W64-O0-NEXT:    v_readlane_b32 s11, v8, 18
+; W64-O0-NEXT:    v_readlane_b32 s6, v8, 12
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_nop 3
+; W64-O0-NEXT:    buffer_load_format_x v0, v0, s[8:11], s6 idxen
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; W64-O0-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; W64-O0-NEXT:    s_cbranch_execnz .LBB2_5
+; W64-O0-NEXT:  ; %bb.7:
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 13
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 14
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; W64-O0-NEXT:  .LBB2_8: ; %bb2
+; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_nop 0
+; W64-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; W64-O0-NEXT:    v_readlane_b32 s4, v8, 10
+; W64-O0-NEXT:    v_readlane_b32 s5, v8, 11
+; W64-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    global_store_dword v[0:1], v2, off
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; W64-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
+; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
   %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %live.out.reg, i32 0, i32 0, i32 0) #1


        


More information about the llvm-commits mailing list