[llvm] 2ca30eb - AMDGPU/GlobalISel: Handle mubuf load/store for more types (#68268)

via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 5 05:36:21 PDT 2023


Author: Matt Arsenault
Date: 2023-10-05T05:36:16-07:00
New Revision: 2ca30eb8fdb186a775787a3931adb86ff5d86417

URL: https://github.com/llvm/llvm-project/commit/2ca30eb8fdb186a775787a3931adb86ff5d86417
DIFF: https://github.com/llvm/llvm-project/commit/2ca30eb8fdb186a775787a3931adb86ff5d86417.diff

LOG: AMDGPU/GlobalISel: Handle mubuf load/store for more types (#68268)

Fixes the MUBUF load/store path for most vector and pointer types, which
unblocks fixing the gfx6/7 RUN lines in assorted tests. Also fixes
inconsistent behavior for -flat-for-global.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/BUFInstructions.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
    llvm/test/CodeGen/AMDGPU/fmed3.ll
    llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
    llvm/test/CodeGen/AMDGPU/v_pack.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index cec35d1147bb0ae..4297111930de254 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -574,11 +574,11 @@ class MUBUF_Store_Pseudo <string opName,
 
 multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> {
 
-  def _OFFSET : GCNPat <
+  def : GCNPat <
     (st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)),
     (!cast<MUBUF_Pseudo>(BaseInst # _OFFSET) store_vt:$vdata, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
 
-  def _ADDR64 : GCNPat <
+  def : GCNPat <
     (st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)),
     (!cast<MUBUF_Pseudo>(BaseInst # _ADDR64) store_vt:$vdata, i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
 }
@@ -912,10 +912,22 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", vt, load_global>;
+}
+
+foreach vt = VReg_96.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", vt, load_global>;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>;
+}
 
 defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
   "buffer_store_byte", i32
@@ -938,10 +950,22 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
 
 defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_BYTE", i32, truncstorei8_global>;
 defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_SHORT", i32, truncstorei16_global>;
-defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", i32, store_global>;
-defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", v2i32, store_global>;
-defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", v3i32, store_global>;
-defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", v4i32, store_global>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", vt, store_global>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", vt, store_global>;
+}
+
+foreach vt = VReg_96.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", vt, store_global>;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>;
+}
 
 defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
   "buffer_atomic_swap", VGPR_32, i32

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 2263672adb94585..7028d1157787fe9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -77,7 +77,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
 ; GFX7-NEXT:    s_and_b32 s1, s2, 1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
@@ -150,7 +153,10 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
@@ -360,7 +366,10 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -402,7 +411,10 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -447,7 +459,10 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -492,7 +507,10 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 5b3cba2000de6fc..0417b97a53c0ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -318,6 +318,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -334,9 +336,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
 ; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_f64:
@@ -381,6 +381,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[0:1]
 ; CI-NEXT:    v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -394,9 +396,7 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fast_frem_f64:
@@ -438,6 +438,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[0:1]
 ; CI-NEXT:    v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -451,9 +453,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: unsafe_frem_f64:
@@ -532,15 +532,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
 ; CI-NEXT:    v_trunc_f32_e32 v3, v3
 ; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT:    v_or_b32_e32 v2, v0, v1
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f16:
@@ -669,15 +669,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v3
 ; CI-NEXT:    v_trunc_f32_e32 v5, v5
 ; CI-NEXT:    v_fma_f32 v3, -v5, v4, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v1, v2, v1
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v4f16:
@@ -1017,6 +1017,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_mov_b32_e32 v1, s9
 ; CI-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
 ; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -1043,9 +1045,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
 ; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
 ; CI-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
-; CI-NEXT:    v_mov_b32_e32 v4, s4
-; CI-NEXT:    v_mov_b32_e32 v5, s5
-; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f64:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index d2b2aef8077f945..d6957be8ab8ffbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -50,11 +50,12 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    s_or_b32 s2, s0, s2
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_s_s:
@@ -135,19 +136,21 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_s_s:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s0, s3, 1
 ; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, s0
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, s1, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, s1, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_s_s:
@@ -228,14 +231,15 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    s_and_b32 s1, s4, 1
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, s0, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_v_s:
@@ -318,15 +322,16 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX7-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, s1, v0
 ; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_not_b32_e32 v0, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_s_v:
@@ -410,15 +415,16 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
-; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v1
-; GFX7-NEXT:    v_not_b32_e32 v0, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v2i16_v_v:
@@ -499,19 +505,21 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_s_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
 ; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v1
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_s_v:
@@ -590,19 +598,21 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_v_s:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s0, s2, 1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX7-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; GFX7-NEXT:    s_not_b32 s0, s0
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_v_s:
@@ -681,19 +691,21 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ;
 ; GFX7-LABEL: insertelement_v_v2i16_v_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
+; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v0, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    flat_store_dword v[0:1], v2
+; GFX7-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v2i16_v_v:
@@ -842,7 +854,10 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_s_s:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s1, s3, 1
 ; GFX7-NEXT:    s_lshr_b32 s0, s3, 1
 ; GFX7-NEXT:    s_and_b32 s2, s2, 0xffff
@@ -851,16 +866,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_not_b32 s1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v4, s1, v4
-; GFX7-NEXT:    v_or_b32_e32 v4, s2, v4
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, s1, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_s_s:
@@ -980,15 +994,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
 ; GFX7-NEXT:    s_lshl_b32 s4, 0xffff, s4
 ; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
-; GFX7-NEXT:    v_or_b32_e32 v4, s3, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, s3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v4i16_v_s:
@@ -1119,15 +1134,16 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_lshl_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_not_b32_e32 v0, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
-; GFX7-NEXT:    v_or_b32_e32 v4, v0, v3
+; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v4i16_s_v:
@@ -1258,15 +1274,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX7-NEXT:    v_or_b32_e32 v4, v1, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, v1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX7-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_s_v4i16_v_v:
@@ -1376,25 +1393,27 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_s_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX7-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX7-NEXT:    v_lshl_b32_e32 v6, s0, v2
+; GFX7-NEXT:    v_lshl_b32_e32 v4, s0, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_not_b32_e32 v2, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_s_v:
@@ -1498,7 +1517,10 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_v_s:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_and_b32 s1, s2, 1
 ; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1507,16 +1529,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val
 ; GFX7-NEXT:    s_lshl_b32 s1, 0xffff, s1
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
 ; GFX7-NEXT:    s_not_b32 s1, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, 0
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v5, s1, v5
-; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, s1, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_v_s:
@@ -1619,25 +1640,27 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ;
 ; GFX7-LABEL: insertelement_v_v4i16_v_v:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v3
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
 ; GFX7-NEXT:    v_not_b32_e32 v3, v3
-; GFX7-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
-; GFX7-NEXT:    v_mov_b32_e32 v5, 0
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX7-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v3, v5, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: insertelement_v_v4i16_v_v:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
index ea5a0c5aceb5d03..f26e23293dae798 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
@@ -31,6 +31,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -42,30 +43,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_4
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -100,6 +106,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_2
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -111,30 +118,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_2
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_2
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_2
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_2
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_2
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -169,6 +181,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -180,30 +193,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -238,6 +256,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v2s32
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -249,30 +268,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_global_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_global_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_global_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_global_v2s32
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -307,6 +331,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -318,30 +343,35 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v4s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_global_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_global_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_global_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_global_v4s32
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -368,39 +398,55 @@ body: |
     ; GFX6-LABEL: name: load_global_s64
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_global_s64
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -427,39 +473,55 @@ body: |
     ; GFX6-LABEL: name: load_global_v2s64
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>), align 4, addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v2s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_global_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_global_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_global_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_global_v2s64
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -489,36 +551,42 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX7-LABEL: name: load_global_v2p1
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2p1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX8-LABEL: name: load_global_v2p1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX9-LABEL: name: load_global_v2p1
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX10-LABEL: name: load_global_v2p1
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ;
     ; GFX11-LABEL: name: load_global_v2p1
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -548,36 +616,42 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX7-LABEL: name: load_global_s128
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s128
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX8-LABEL: name: load_global_s128
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX9-LABEL: name: load_global_s128
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX10-LABEL: name: load_global_s128
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ;
     ; GFX11-LABEL: name: load_global_s128
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -604,39 +678,55 @@ body: |
     ; GFX6-LABEL: name: load_global_p3_from_4
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load (p3), addrspace 1)
-    ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_p3_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1)
-    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1)
+    ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_p3_from_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_global_p3_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_global_p3_from_4
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_global_p3_from_4
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_global_p3_from_4
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -663,39 +753,55 @@ body: |
     ; GFX6-LABEL: name: load_global_p1_from_8
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load (p1), addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_p1_from_8
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_p1_from_8
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_global_p1_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_global_p1_from_8
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_global_p1_from_8
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_global_p1_from_8
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -725,36 +831,42 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX7-LABEL: name: load_global_p999_from_8
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_p999_from_8
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX8-LABEL: name: load_global_p999_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX9-LABEL: name: load_global_p999_from_8
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX10-LABEL: name: load_global_p999_from_8
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ;
     ; GFX11-LABEL: name: load_global_p999_from_8
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -784,36 +896,42 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX7-LABEL: name: load_global_v2p3
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2p3
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX8-LABEL: name: load_global_v2p3
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX9-LABEL: name: load_global_v2p3
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX10-LABEL: name: load_global_v2p3
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
     ; GFX11-LABEL: name: load_global_v2p3
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -840,39 +958,55 @@ body: |
     ; GFX6-LABEL: name: load_global_v2s16
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load (<2 x s16>), addrspace 1)
-    ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
+    ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v2s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1)
-    ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
+    ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v2s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX8-LABEL: name: load_global_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ;
     ; GFX9-LABEL: name: load_global_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX10-LABEL: name: load_global_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+    ;
     ; GFX11-LABEL: name: load_global_v2s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -899,39 +1033,55 @@ body: |
     ; GFX6-LABEL: name: load_global_v4s16
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load (<4 x s16>), addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v4s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v4s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX8-LABEL: name: load_global_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ;
     ; GFX9-LABEL: name: load_global_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX10-LABEL: name: load_global_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+    ;
     ; GFX11-LABEL: name: load_global_v4s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -958,39 +1108,55 @@ body: |
     ; GFX6-LABEL: name: load_global_v8s16
     ; GFX6: liveins: $vgpr0_vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_v8s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_v8s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX8-LABEL: name: load_global_v8s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ;
     ; GFX9-LABEL: name: load_global_v8s16
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX10-LABEL: name: load_global_v8s16
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+    ;
     ; GFX11-LABEL: name: load_global_v8s16
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1029,6 +1195,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1040,6 +1207,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1056,6 +1224,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1072,18 +1241,21 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1120,6 +1292,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1131,6 +1304,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1147,6 +1321,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1163,12 +1338,14 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1185,6 +1362,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1231,6 +1409,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1252,6 +1431,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1268,6 +1448,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1284,18 +1465,21 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1342,6 +1526,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1363,6 +1548,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1379,6 +1565,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1395,18 +1582,21 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1443,6 +1633,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1454,6 +1645,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1470,6 +1662,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1486,12 +1679,14 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1508,6 +1703,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1545,6 +1741,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1557,6 +1754,7 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1573,6 +1771,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1589,6 +1788,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -1605,6 +1805,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1621,6 +1822,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1677,6 +1879,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1698,6 +1901,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1714,6 +1918,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1730,12 +1935,14 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1752,6 +1959,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1798,6 +2006,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1819,6 +2028,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1835,6 +2045,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1851,12 +2062,14 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1873,6 +2086,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -1910,6 +2124,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -1922,6 +2137,7 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1938,6 +2154,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -1954,6 +2171,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -1970,6 +2188,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -1986,6 +2205,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -2033,6 +2253,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -2045,6 +2266,7 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -2061,6 +2283,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -2077,6 +2300,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -2093,6 +2317,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -2109,6 +2334,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -2165,6 +2391,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -2186,6 +2413,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -2202,6 +2430,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -2218,6 +2447,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -2234,6 +2464,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -2250,6 +2481,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
@@ -2306,6 +2538,7 @@ body: |
     ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7-NEXT: {{  $}}
@@ -2327,6 +2560,7 @@ body: |
     ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
     ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ;
     ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -2343,6 +2577,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8-NEXT: {{  $}}
@@ -2359,6 +2594,7 @@ body: |
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1)
     ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ;
     ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -2375,6 +2611,7 @@ body: |
     ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -2391,6 +2628,7 @@ body: |
     ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1)
     ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]]
+    ;
     ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
index f36cb1d97821401..c56ba70b667d96d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
@@ -29,6 +29,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s32_to_4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -40,24 +41,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_4
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s32_to_4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s32_to_4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s32_to_4
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -91,6 +96,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s32_to_2
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -102,24 +108,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_2
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s32_to_2
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s32_to_2
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s16), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s32_to_2
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -153,6 +163,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s32_to_1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -164,24 +175,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s32_to_1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s32_to_1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s32_to_1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s8), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s32_to_1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -208,33 +223,48 @@ body: |
     ; GFX6-LABEL: name: store_global_s64
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
-    ; GFX6-NEXT: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store (s64), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -263,30 +293,35 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX6-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s128
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s128
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s128
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s128
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s128
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -321,6 +356,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
@@ -332,24 +368,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -383,6 +423,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
@@ -394,24 +435,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v4s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v4s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v4s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -438,33 +483,48 @@ body: |
     ; GFX6-LABEL: name: store_global_v2s16
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2
-    ; GFX6-NEXT: G_STORE [[COPY1]](<2 x s16>), [[COPY]](p1) :: (store (<2 x s16>), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -491,33 +551,48 @@ body: |
     ; GFX6-LABEL: name: store_global_v4s16
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GFX6-NEXT: G_STORE [[COPY1]](<4 x s16>), [[COPY]](p1) :: (store (<4 x s16>), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v4s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v4s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v4s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -544,33 +619,48 @@ body: |
     ; GFX6-LABEL: name: store_global_v8s16
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX6-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v8s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v8s16
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v8s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v8s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v8s16
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -597,33 +687,48 @@ body: |
     ; GFX6-LABEL: name: store_global_v2s64
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX6-NEXT: G_STORE [[COPY1]](<2 x s64>), [[COPY]](p1) :: (store (<2 x s64>), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -650,33 +755,48 @@ body: |
     ; GFX6-LABEL: name: store_global_p1
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3
-    ; GFX6-NEXT: G_STORE [[COPY1]](p1), [[COPY]](p1) :: (store (p1), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p1), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_p1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p1), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_p1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_p1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p1), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_p1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -706,30 +826,35 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX6-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2p1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2p1
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2p1
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2p1
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -756,33 +881,48 @@ body: |
     ; GFX6-LABEL: name: store_global_p3
     ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6-NEXT: G_STORE [[COPY1]](p3), [[COPY]](p1) :: (store (p3), addrspace 1)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_p3
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1)
+    ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_p3
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_p3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p3), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_p3
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -812,30 +952,35 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX6-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_v2p3
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
     ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_v2p3
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_v2p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_v2p3
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_v2p3
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -869,6 +1014,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_atomic_global_s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -880,24 +1026,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_atomic_global_s32
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_atomic_global_s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_atomic_global_s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_atomic_global_s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
@@ -932,6 +1082,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_atomic_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-NEXT: {{  $}}
@@ -943,24 +1094,28 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_atomic_global_s64
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: {{  $}}
     ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_atomic_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8-NEXT: {{  $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_atomic_global_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_atomic_global_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -995,6 +1150,7 @@ body: |
     ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX7-LABEL: name: store_global_s32_gep_2047
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-NEXT: {{  $}}
@@ -1006,6 +1162,7 @@ body: |
     ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
     ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX7-FLAT-LABEL: name: store_global_s32_gep_2047
     ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7-FLAT-NEXT: {{  $}}
@@ -1022,6 +1179,7 @@ body: |
     ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
     ; GFX8-LABEL: name: store_global_s32_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8-NEXT: {{  $}}
@@ -1038,12 +1196,14 @@ body: |
     ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1)
+    ;
     ; GFX9-LABEL: name: store_global_s32_gep_2047
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (store (s32), addrspace 1)
+    ;
     ; GFX10-LABEL: name: store_global_s32_gep_2047
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 7160f03d2c3d62c..a5482bd5b79a96f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -748,11 +748,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX7-NEXT:    s_nop 3
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_nop 1
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f64:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index a325288ca1601fb..13f788520710513 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -174,24 +174,21 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
 define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) {
 ; GFX7-LABEL: test_div_scale_f64_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
-; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_1:
@@ -263,24 +260,21 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
 define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) {
 ; GFX7-LABEL: test_div_scale_f64_2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
-; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[0:1], v[2:3]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_2:
@@ -649,19 +643,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
 ; GFX7-LABEL: test_div_scale_f64_scalar_num_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[8:9]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
@@ -724,19 +718,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
 ; GFX7-LABEL: test_div_scale_f64_scalar_num_2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[8:9], v[0:1], s[8:9]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
@@ -799,19 +793,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
 ; GFX7-LABEL: test_div_scale_f64_scalar_den_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[8:9], s[8:9], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
@@ -874,19 +868,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
 ; GFX7-LABEL: test_div_scale_f64_scalar_den_2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x15
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[8:9], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
@@ -1071,9 +1065,9 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_all_scalar_1:
@@ -1131,9 +1125,9 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_all_scalar_2:
@@ -1644,14 +1638,14 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
 define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 {
 ; GFX7-LABEL: test_div_scale_f64_val_undef_val:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s3, 0x40200000
 ; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v3, s1
-; GFX7-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_scale_f64_val_undef_val:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 8506efc1d978623..043e69abaeef2d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -31,9 +31,9 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
   store i64 %tmp, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 469e6ddad0a0701..c28d204784d4b2a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -4,8 +4,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s
 
-; FIXME:
-; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
 
 define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -112,6 +111,56 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v3
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v6, v5
 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align1:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
+; GFX6-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
+; GFX6-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
+; GFX6-NEXT:    buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:5
+; GFX6-NEXT:    buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:7
+; GFX6-NEXT:    buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:6
+; GFX6-NEXT:    buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:9
+; GFX6-NEXT:    buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11
+; GFX6-NEXT:    buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:10
+; GFX6-NEXT:    buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX6-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(11)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(10)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX6-NEXT:    s_waitcnt vmcnt(6)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
+; GFX6-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
+; GFX6-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
+; GFX6-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_or_b32_e32 v3, v4, v12
+; GFX6-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT:    v_or_b32_e32 v6, v8, v9
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v4, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v6, v5
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
   ret <3 x i32> %load
 }
@@ -176,6 +225,32 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v7
 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align2:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2
+; GFX6-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:6
+; GFX6-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10
+; GFX6-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX6-NEXT:    buffer_load_ushort v7, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
   ret <3 x i32> %load
 }
@@ -197,6 +272,20 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
   ret <3 x i32> %load
 }
@@ -218,6 +307,20 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_i96_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load i96, ptr addrspace(4) %ptr, align 8
   ret i96 %load
 }
@@ -239,6 +342,20 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
   ret <3 x i32> %load
 }
@@ -266,6 +383,23 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v6i16_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX6-NEXT:    v_mov_b32_e32 v0, v6
+; GFX6-NEXT:    v_mov_b32_e32 v2, v7
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
   ret <6 x i16> %load
 }
@@ -313,6 +447,29 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v13
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v12
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v12i8_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx2 v[12:13], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v8, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v12
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
+; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
+; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX6-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
+; GFX6-NEXT:    v_mov_b32_e32 v0, v12
+; GFX6-NEXT:    v_mov_b32_e32 v4, v13
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
   ret <12 x i8> %load
 }
@@ -334,6 +491,16 @@ define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: v_load_constant_v3i32_align16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
   ret <3 x i32> %load
 }
@@ -451,6 +618,57 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v3i32_align1:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:1
+; GFX6-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:3
+; GFX6-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0 offset:2
+; GFX6-NEXT:    buffer_load_ubyte v3, off, s[0:3], 0 offset:5
+; GFX6-NEXT:    buffer_load_ubyte v4, off, s[0:3], 0 offset:7
+; GFX6-NEXT:    buffer_load_ubyte v5, off, s[0:3], 0 offset:6
+; GFX6-NEXT:    buffer_load_ubyte v6, off, s[0:3], 0 offset:9
+; GFX6-NEXT:    buffer_load_ubyte v7, off, s[0:3], 0 offset:11
+; GFX6-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0 offset:10
+; GFX6-NEXT:    buffer_load_ubyte v9, off, s[0:3], 0
+; GFX6-NEXT:    buffer_load_ubyte v10, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_load_ubyte v11, off, s[0:3], 0 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(11)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(10)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(6)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; GFX6-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX6-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_or_b32_e32 v2, v3, v10
+; GFX6-NEXT:    v_or_b32_e32 v3, v4, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_or_b32_e32 v4, v6, v11
+; GFX6-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
   ret <3 x i32> %load
 }
@@ -523,6 +741,33 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v3i32_align2:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX6-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:6
+; GFX6-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:10
+; GFX6-NEXT:    buffer_load_ushort v3, off, s[0:3], 0
+; GFX6-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
+; GFX6-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
   ret <3 x i32> %load
 }
@@ -545,6 +790,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
 ; GFX7-NEXT:    s_mov_b32 s0, s4
 ; GFX7-NEXT:    s_mov_b32 s1, s5
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v3i32_align4:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
   ret <3 x i32> %load
 }
@@ -567,6 +821,15 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
 ; GFX7-NEXT:    s_mov_b32 s0, s4
 ; GFX7-NEXT:    s_mov_b32 s1, s5
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_i96_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load i96, ptr addrspace(4) %ptr, align 8
   ret i96 %load
 }
@@ -589,6 +852,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
 ; GFX7-NEXT:    s_mov_b32 s0, s4
 ; GFX7-NEXT:    s_mov_b32 s1, s5
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v3i32_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
   ret <3 x i32> %load
 }
@@ -611,6 +883,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
 ; GFX7-NEXT:    s_mov_b32 s0, s4
 ; GFX7-NEXT:    s_mov_b32 s1, s5
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v6i16_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
   %cast = bitcast <6 x i16> %load to <3 x i32>
   ret <3 x i32> %cast
@@ -652,6 +933,24 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
 ; GFX7-NEXT:    s_mov_b32 s0, s12
 ; GFX7-NEXT:    s_mov_b32 s4, s13
 ; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX6-LABEL: s_load_constant_v12i8_align8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
+; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshr_b32 s1, s12, 8
+; GFX6-NEXT:    s_lshr_b32 s2, s12, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s12, 24
+; GFX6-NEXT:    s_lshr_b32 s5, s13, 8
+; GFX6-NEXT:    s_lshr_b32 s6, s13, 16
+; GFX6-NEXT:    s_lshr_b32 s7, s13, 24
+; GFX6-NEXT:    s_lshr_b32 s9, s8, 8
+; GFX6-NEXT:    s_lshr_b32 s10, s8, 16
+; GFX6-NEXT:    s_lshr_b32 s11, s8, 24
+; GFX6-NEXT:    s_mov_b32 s0, s12
+; GFX6-NEXT:    s_mov_b32 s4, s13
+; GFX6-NEXT:    ; return to shader part epilog
   %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
   ret <12 x i8> %load
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 50f143dc448c3e1..7ad19a47970039c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -329,16 +329,13 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v4, s1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s0
+; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_or_b32_e32 v0, 0xff800000, v1
-; GFX7-NEXT:    v_mul_i32_i24_e32 v1, -7, v0
-; GFX7-NEXT:    v_lshl_b64 v[0:1], v[1:2], 3
-; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v5
-; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT:    v_or_b32_e32 v1, 0xff800000, v1
+; GFX7-NEXT:    v_mul_i32_i24_e32 v1, -7, v1
+; GFX7-NEXT:    v_lshl_b64 v[3:4], v[1:2], 3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX7-NEXT:    buffer_store_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: muli24_shl64:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index 9bc618950388714..c040c912b9ff796 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -1,5 +1,4 @@
-; FIXME: Need to add support for mubuf stores to enable this on SI.
-; XUN: llc < %s -march=amdgcn -mcpu=tahiti -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=SI,GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=SI,GCN %s
 ; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=CI,GCN,SICIVI %s
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=VI,GCN,SICIVI %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -show-mc-encoding -verify-machineinstrs -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10  %s

diff  --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 40ebb191802a89b..70d915df7cb007f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
-; TODO: LLVM ERROR: cannot select: G_STORE %30:vgpr(s64), %22:vgpr(p1)
-; RUN: not --crash llc -march=amdgcn -global-isel=1 -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-SDAG %s
+; RUN: llc -march=amdgcn -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s
@@ -10,22 +9,39 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL %s
 
 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -104,22 +120,39 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -199,22 +232,39 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -294,22 +344,39 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -389,23 +456,41 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
-; SI-NEXT:    v_min_f32_e32 v2, 2.0, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, 4.0, v2
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, 2.0, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, 4.0, v2
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, 2.0, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -488,26 +573,47 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
 }
 
 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_max_f32_e32 v2, 2.0, v2
-; SI-NEXT:    v_min_f32_e32 v3, 4.0, v2
-; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, 2.0, v2
+; SI-SDAG-NEXT:    v_min_f32_e32 v3, 4.0, v2
+; SI-SDAG-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v3, 2.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -639,23 +745,41 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
 }
 
 define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_fmed3_r_i_i_f64:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; SI-NEXT:    v_max_f64 v[2:3], v[2:3], 2.0
-; SI-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], 2.0
+; SI-SDAG-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; SI-SDAG-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f64:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], 2.0
+; SI-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
 ; VI-SDAG:       ; %bb.0:
@@ -739,21 +863,37 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
 }
 
 define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 {
-; SI-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -827,22 +967,40 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_legacy_fmed3_r_i_i_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_max_legacy_f32_e64 v2, v2, 2.0
+; SI-GISEL-NEXT:    v_min_legacy_f32_e64 v2, v2, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -963,29 +1121,52 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
 ; VI-SDAG:       ; %bb.0:
@@ -1130,29 +1311,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, -v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, -v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, -1.0, v3
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
 ; VI-SDAG:       ; %bb.0:
@@ -1297,29 +1501,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, -v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, -v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, -1.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
 ; VI-SDAG:       ; %bb.0:
@@ -1464,29 +1691,53 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -v2, |v3|, -|v4|
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -v2, |v3|, -|v4|
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e64 v4, -1.0, |v4|
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, |v3|, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
 ; VI-SDAG:       ; %bb.0:
@@ -1640,29 +1891,54 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -|v2|, -|v3|, -|v4|
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -|v2|, -|v3|, -|v4|
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e64 v2, -1.0, |v2|
+; SI-GISEL-NEXT:    v_mul_f32_e64 v3, -1.0, |v3|
+; SI-GISEL-NEXT:    v_mul_f32_e64 v4, -1.0, |v4|
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
 ; VI-SDAG:       ; %bb.0:
@@ -1821,32 +2097,57 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
 }
 
 define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -1971,29 +2272,51 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
 }
 
 define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_input_calls_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -2101,29 +2424,51 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
 }
 
 define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_call_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_call_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -2231,29 +2576,51 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_fast_call_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_fast_call_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -2373,29 +2740,51 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
 ; + commute outermost max
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -2503,29 +2892,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat1:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
 ; VI-SDAG:       ; %bb.0:
@@ -2633,29 +3044,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
 ; VI-SDAG:       ; %bb.0:
@@ -2800,29 +3234,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat2:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
 ; VI-SDAG:       ; %bb.0:
@@ -2930,29 +3386,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat3:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
 ; VI-SDAG:       ; %bb.0:
@@ -3060,29 +3538,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat4:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
 ; VI-SDAG:       ; %bb.0:
@@ -3190,29 +3690,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat5:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
 ; VI-SDAG:       ; %bb.0:
@@ -3320,29 +3842,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat6:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
 ; VI-SDAG:       ; %bb.0:
@@ -3450,29 +3994,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat7:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
 ; VI-SDAG:       ; %bb.0:
@@ -3580,29 +4146,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat8:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
 ; VI-SDAG:       ; %bb.0:
@@ -3710,29 +4298,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat9:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
 ; VI-SDAG:       ; %bb.0:
@@ -3840,29 +4450,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat10:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
 ; VI-SDAG:       ; %bb.0:
@@ -3970,29 +4602,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat11:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
 ; VI-SDAG:       ; %bb.0:
@@ -4100,29 +4754,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat12:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
 ; VI-SDAG:       ; %bb.0:
@@ -4230,29 +4906,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat13:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
 ; VI-SDAG:       ; %bb.0:
@@ -4360,29 +5058,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat14:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
 ; VI-SDAG:       ; %bb.0:
@@ -4490,29 +5210,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat15:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v3, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v3, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
 ; VI-SDAG:       ; %bb.0:
@@ -4623,29 +5365,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
 ; min(max(x, y), max(min(x, y), z))
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat16:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
 ; VI-SDAG:       ; %bb.0:
@@ -4757,38 +5521,70 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
 ; ---------------------------------------------------------------------
 
 define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_min_f32_e32 v5, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    buffer_store_dword v5, off, s[8:11], 0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s10, -1
+; SI-SDAG-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-SDAG-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-SDAG-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    buffer_store_dword v5, off, s[8:11], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_mov_b32 s3, s11
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
 ; VI-SDAG:       ; %bb.0:
@@ -4927,38 +5723,70 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_min_f32_e32 v5, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s10, -1
+; SI-SDAG-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-SDAG-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-SDAG-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    buffer_store_dword v2, off, s[8:11], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_mov_b32 s3, s11
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
 ; VI-SDAG:       ; %bb.0:
@@ -5121,38 +5949,70 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_min_f32_e32 v5, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s10, -1
+; SI-SDAG-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-SDAG-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-SDAG-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, off, s[8:11], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_mov_b32 s3, s11
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
 ; VI-SDAG:       ; %bb.0:
@@ -5291,34 +6151,62 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_safe_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_min_f32_e32 v5, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT:    v_min_f32_e32 v2, v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-SDAG-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -5469,32 +6357,57 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -5619,32 +6532,57 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -5769,32 +6707,57 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -5919,29 +6882,52 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
 }
 
 define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_med3_f32 v2, -v2, v3, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_med3_f32 v2, -v2, v3, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, -1.0, v2
+; SI-GISEL-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
 ; VI-SDAG:       ; %bb.0:
@@ -6086,31 +7072,57 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
 }
 
 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_min_f32_e64 v5, -v2, v3
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    v_max_f32_e32 v2, v5, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_min_f32_e64 v5, -v2, v3
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, -1.0, v2
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v5, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v5, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
 ; VI-SDAG:       ; %bb.0:
@@ -6272,30 +7284,53 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
 
 ; A simple min and max is not sufficient
 define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 {
-; SI-LABEL: v_test_global_nnans_min_max_f32:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_max_f32_e32 v2, v2, v3
-; SI-NEXT:    v_min_f32_e32 v2, v2, v4
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_global_nnans_min_max_f32:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_global_nnans_min_max_f32:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32:
 ; VI-SDAG:       ; %bb.0:
@@ -6404,24 +7439,51 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
 }
 
 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, 2.0, 4.0
+; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-SDAG-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, 1.0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v4, 2.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, 4.0
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
 ; VI-SDAG:       ; %bb.0:
@@ -6502,36 +7564,83 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
 }
 
 define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 {
-; SI-LABEL: v_nnan_inputs_med3_f16_pat0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_mov_b64 s[14:15], s[10:11]
-; SI-NEXT:    s_mov_b64 s[18:19], s[10:11]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
-; SI-NEXT:    s_mov_b64 s[16:17], s[6:7]
-; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[12:15], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[16:19], 0 addr64 glc
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s11, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s10, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_mov_b64 s[14:15], s[10:11]
+; SI-SDAG-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-SDAG-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; SI-SDAG-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; SI-SDAG-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_ushort v3, v[0:1], s[12:15], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_load_ushort v4, v[0:1], s[16:19], 0 addr64 glc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v3, 2.0, v3
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; SI-SDAG-NEXT:    v_med3_f32 v2, v2, v3, v4
+; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-SDAG-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, 0
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, 2.0
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v5, 4.0
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; SI-GISEL-NEXT:    buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-GISEL-NEXT:    buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v4, v6
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v4, v7
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-GISEL-NEXT:    v_min_f32_e32 v5, v2, v3
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v5
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-GISEL-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
 ; VI-SDAG:       ; %bb.0:
@@ -6663,23 +7772,41 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
 }
 
 define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: two_non_inline_constant:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v2, 0.5, v2
-; SI-NEXT:    v_max_f32_e32 v2, 0x41000000, v2
-; SI-NEXT:    v_min_f32_e32 v2, 0x41800000, v2
-; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: two_non_inline_constant:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 0.5, v2
+; SI-SDAG-NEXT:    v_max_f32_e32 v2, 0x41000000, v2
+; SI-SDAG-NEXT:    v_min_f32_e32 v2, 0x41800000, v2
+; SI-SDAG-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: two_non_inline_constant:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0.5, v2
+; SI-GISEL-NEXT:    v_max_f32_e32 v2, 0x41000000, v2
+; SI-GISEL-NEXT:    v_min_f32_e32 v2, 0x41800000, v2
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: two_non_inline_constant:
 ; VI-SDAG:       ; %bb.0:
@@ -6779,27 +7906,49 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
 
 ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants.
 define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: one_non_inline_constant:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    v_mov_b32_e32 v3, 0x41800000
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v4, 0.5, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x41800000, v2
-; SI-NEXT:    v_med3_f32 v3, v4, 1.0, v3
-; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: one_non_inline_constant:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 0.5, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 0x41800000, v2
+; SI-SDAG-NEXT:    v_med3_f32 v3, v4, 1.0, v3
+; SI-SDAG-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: one_non_inline_constant:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0x41800000
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, 0.5, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x41800000, v2
+; SI-GISEL-NEXT:    v_med3_f32 v3, v3, 1.0, s4
+; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: one_non_inline_constant:
 ; VI-SDAG:       ; %bb.0:
@@ -6912,31 +8061,57 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 }
 
 define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
-; SI-LABEL: two_non_inline_constant_multi_use:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, 0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_mov_b32 s4, 0x41000000
-; SI-NEXT:    v_mov_b32_e32 v3, 0x41800000
-; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_f32_e32 v4, 0.5, v2
-; SI-NEXT:    v_add_f32_e32 v5, 0x41800000, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x41000000, v2
-; SI-NEXT:    v_med3_f32 v3, v4, s4, v3
-; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_store_dword v5, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    s_endpgm
+; SI-SDAG-LABEL: two_non_inline_constant_multi_use:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-SDAG-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0x41000000
+; SI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; SI-SDAG-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-SDAG-NEXT:    s_mov_b32 s6, -1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    v_add_f32_e32 v4, 0.5, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v5, 0x41800000, v2
+; SI-SDAG-NEXT:    v_add_f32_e32 v2, 0x41000000, v2
+; SI-SDAG-NEXT:    v_med3_f32 v3, v4, s4, v3
+; SI-SDAG-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-SDAG-NEXT:    buffer_store_dword v5, off, s[4:7], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT:    s_endpgm
+;
+; SI-GISEL-LABEL: two_non_inline_constant_multi_use:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0x41000000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 0.5, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, 0x41800000, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x41000000, v2
+; SI-GISEL-NEXT:    v_med3_f32 v3, v4, s4, v3
+; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-GISEL-NEXT:    buffer_store_dword v5, off, s[4:7], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: two_non_inline_constant_multi_use:
 ; VI-SDAG:       ; %bb.0:
@@ -7085,3 +8260,5 @@ declare half @llvm.maxnum.f16(half, half) #0
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
 attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SI: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index a8709c8a9a7c0fd..c5a430823ecf331 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=0 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
-; TODO: Crashes on selecting G_STORE.
-; RUN: not --crash llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=1 -verify-machineinstrs -enable-unsafe-fp-math < %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=1 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
@@ -28,6 +27,18 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_f32_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s3
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_f32_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -150,6 +161,19 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_f64_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_f64_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -282,6 +306,21 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s4
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s5
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -311,10 +350,10 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-GISEL-NEXT:    v_or_b32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    s_mov_b32 s2, -1
+; VI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
@@ -345,9 +384,10 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
@@ -381,10 +421,11 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -421,6 +462,23 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
+; SI-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -448,15 +506,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VI-GISEL-NEXT:    s_mov_b32 s2, -1
+; VI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; VI-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-GISEL-NEXT:    v_or_b32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
+; VI-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-GISEL-NEXT:    s_endpgm
 ;
 ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
@@ -486,14 +544,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
@@ -528,6 +587,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
@@ -536,8 +597,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ; GFX11-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-GISEL-NEXT:    s_endpgm
@@ -569,6 +629,18 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -s3
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -691,6 +763,18 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s3|
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -813,6 +897,18 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; SI-SDAG-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, -|s3|
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -936,6 +1032,18 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s3
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -1063,6 +1171,18 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e64 v0, |s3|
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -1192,6 +1312,19 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ; SI-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-SDAG-NEXT:    s_endpgm
 ;
+; SI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
+; SI-GISEL:       ; %bb.0: ; %entry
+; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_load_dword s3, s[2:3], 0x0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s3
+; SI-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT:    s_endpgm
+;
 ; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; VI-SDAG:       ; %bb.0: ; %entry
 ; VI-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24

diff  --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 5ec9284c870c186..a8cbb0000ce722b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -131,9 +131,10 @@ define amdgpu_kernel void @fptrunc(
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {


        


More information about the llvm-commits mailing list