[llvm] 33ab74a - [AMDGPU] Precommit switching test to generated checks for D134463

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 23 15:13:23 PDT 2022


Author: jeff
Date: 2022-09-23T15:12:53-07:00
New Revision: 33ab74ac466fcbc3e5c1fcac664cfc544592a3a9

URL: https://github.com/llvm/llvm-project/commit/33ab74ac466fcbc3e5c1fcac664cfc544592a3a9
DIFF: https://github.com/llvm/llvm-project/commit/33ab74ac466fcbc3e5c1fcac664cfc544592a3a9.diff

LOG: [AMDGPU] Precommit switching test to generated checks for D134463

Change-Id: I0d90f86ab759347a2f20448d28cc09ddaea3a4d4

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/add.v2i16.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
    llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
    llvm/test/CodeGen/AMDGPU/load-hi16.ll
    llvm/test/CodeGen/AMDGPU/pack.v2i16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 32fdf242a28e..baa28bf78534 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -1,16 +1,68 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9PLUS,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9PLUS,GFX10 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-; GCN-LABEL: {{^}}v_test_add_v2i16:
-; GFX9PLUS: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-
-; FIXME: or should be unnecessary
-; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32
+; FIXME: VI or should be unnecessary
 define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+; VI-LABEL: v_test_add_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    flat_load_dword v1, v[2:3] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_add_u16_e32 v2, v0, v1
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_pk_add_u16 v0, v1, v2
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -22,16 +74,61 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
   ret void
 }
 
-; GCN-LABEL: {{^}}s_test_add_v2i16:
-; GFX9PLUS: s_load_dword [[VAL0:s[0-9]+]]
-; GFX9PLUS: s_load_dword [[VAL1:s[0-9]+]]
-; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL1]], [[VVAL1]]
-; GFX10: v_pk_add_u16 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
-
-; VI: s_add_i32
-; VI: s_add_i32
 define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
+; VI-LABEL: s_test_add_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s4, s6, 16
+; VI-NEXT:    s_lshr_b32 s5, s7, 16
+; VI-NEXT:    s_add_i32 s6, s6, s7
+; VI-NEXT:    s_add_i32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s5, s6, 0xffff
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_test_add_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
+; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9-NEXT:    v_pk_add_u16 v0, s11, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: s_test_add_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
   %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
   %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
   %add = add <2 x i16> %a, %b
@@ -39,13 +136,49 @@ define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
   ret void
 }
 
-; GCN-LABEL: {{^}}s_test_add_self_v2i16:
-; GFX9PLUS: s_load_dword [[VAL:s[0-9]+]]
-; GFX9PLUS: v_pk_add_u16 v{{[0-9]+}}, [[VAL]], [[VAL]]
-
-; VI: s_add_i32
-; VI: s_add_i32
 define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
+; VI-LABEL: s_test_add_self_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_add_i32 s4, s4, s4
+; VI-NEXT:    s_add_i32 s5, s5, s5
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_test_add_self_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_add_u16 v0, s4, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: s_test_add_self_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_u16 v0, s2, s2
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    s_endpgm
   %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
   %add = add <2 x i16> %a, %a
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -53,32 +186,103 @@ define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <
 }
 
 ; FIXME: VI should not scalarize arg access.
-; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX10: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-
-; VI: s_add_i32
-; VI: s_add_i32
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; VI: s_and_b32
-; VI: s_or_b32
 define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+; VI-LABEL: s_test_add_v2i16_kernarg:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshr_b32 s0, s2, 16
+; VI-NEXT:    s_lshr_b32 s1, s3, 16
+; VI-NEXT:    s_add_i32 s0, s0, s1
+; VI-NEXT:    s_add_i32 s1, s2, s3
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_test_add_v2i16_kernarg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_mov_b32 s4, s0
+; GFX9-NEXT:    s_mov_b32 s5, s1
+; GFX9-NEXT:    v_pk_add_u16 v0, s2, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: s_test_add_v2i16_kernarg:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_u16 v0, s2, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
   %add = add <2 x i16> %a, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
   ret void
 }
 
 ; FIXME: Eliminate or with sdwa
-; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
-; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
-; GFX10: v_pk_add_u16 v{{[0-9]+}}, 0x1c8007b, v{{[0-9]+}}
-
-; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+; VI-LABEL: v_test_add_v2i16_constant:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, 0x1c8
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_add_u16_e32 v2, 0x7b, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_constant:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_constant:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_add_u16 v0, 0x1c8007b, v0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -89,15 +293,53 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant:
-; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
-; GFX10: v_pk_add_u16 v{{[0-9]+}}, 0xfc21fcb3, v{{[0-9]+}}
-
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfcb3, v{{[0-9]+}}
-; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+; VI-LABEL: v_test_add_v2i16_neg_constant:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffc21
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_add_u16_e32 v2, 0xfcb3, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_neg_constant:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_neg_constant:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_add_u16 v0, 0xfc21fcb3, v0
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -107,15 +349,52 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)*
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
-; GFX9PLUS: v_pk_sub_u16 v{{[0-9]+}}, v{{[0-9]+}}, 1 op_sel_hi:[1,0]{{$}}
-
-; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
-; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]]
-; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+; VI-LABEL: v_test_add_v2i16_inline_neg1:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, -1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_add_u16_e32 v2, -1, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -125,16 +404,51 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)*
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
-; GFX9PLUS: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
-
-; VI: flat_load_dword
-; VI-NOT: v_add_u16
-; VI: v_and_b32_e32 v{{[0-9]+}}, 0xffff0000,
-; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
-; VI-NOT: v_add_u16
-; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT:    v_add_u16_e32 v0, 32, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, 32
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_add_u16 v0, v0, 32
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -145,17 +459,52 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
 }
 
 ; The high element gives fp
-; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]{{$}}
-; GFX10: v_pk_add_u16 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}}
-
-; VI-NOT: v_add_u16
-; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NOT: v_add_u16
-; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+; VI-LABEL: v_test_add_v2i16_inline_fp_split:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s4, 1.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1]
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -166,26 +515,67 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i32:
-; GFX9PLUS: global_load_dword [[A:v[0-9]+]]
-; GFX9PLUS: global_load_dword [[B:v[0-9]+]]
-
-; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
-; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
-; GFX9PLUS-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
-; GFX9PLUS: buffer_store_dwordx2 v[[[ELT0]]:[[ELT1]]]
-
-; VI: flat_load_dword v[[A:[0-9]+]]
-; VI: flat_load_dword v[[B:[0-9]+]]
-
-; VI-NOT: and
-; VI-NOT: shl
-; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
-; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI-NOT: shl
-; VI: buffer_store_dwordx2 v[[[ADD_LO]]:[[ADD_HI]]]
 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v1, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    flat_load_dword v2, v[2:3] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_add_u16_e32 v0, v1, v2
+; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_pk_add_u16 v0, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -199,25 +589,75 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:
-; GFX9PLUS: v_mov_b32_e32 [[MASK:v[0-9+]]], 0xffff
-; GFX9PLUS: global_load_dword [[A:v[0-9]+]]
-; GFX9PLUS: global_load_dword [[B:v[0-9]+]]
-
-; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
-; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
-; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9PLUS: buffer_store_dwordx4
-
-; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI-DAG: flat_load_dword v[[A:[0-9]+]]
-; VI-DAG: flat_load_dword v[[B:[0-9]+]]
-
-; VI-DAG: v_add_u16_e32
-; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-
-; VI: buffer_store_dwordx4
 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v4, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    flat_load_dword v2, v[2:3] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_mov_b32_e32 v3, v1
+; VI-NEXT:    v_add_u16_e32 v0, v4, v2
+; VI-NEXT:    v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_pk_add_u16 v2, v2, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX9-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    v_pk_add_u16 v2, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -231,22 +671,69 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i32:
-; GFX9PLUS: global_load_dword [[A:v[0-9]+]]
-; GFX9PLUS: global_load_dword [[B:v[0-9]+]]
-
-; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
-; GFX9PLUS-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
-; GFX9PLUS-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
-; GFX9PLUS: buffer_store_dwordx2 v[[[ELT0]]:[[ELT1]]]
-
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_add_u16_e32
-
-; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
-; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
-; VI: buffer_store_dwordx2
 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    flat_load_dword v1, v[2:3] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_add_u16_e32 v0, v0, v1
+; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    v_pk_add_u16 v0, v1, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@@ -260,21 +747,75 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i64:
-; GCN: {{flat|global}}_load_dword
-; GCN: {{flat|global}}_load_dword
-
-; GFX9PLUS: v_pk_add_u16
-; GFX9PLUS: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-
-; VI: v_add_u16_sdwa
-; VI: v_add_u16_e32
-
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
-; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_add_u16_e32 v0, v0, v1
+; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_pk_add_u16 v1, v1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
+; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index b49a9a6fda8a..7819bdd18db5 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -1,10 +1,42 @@
-; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX906 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
 
-; GCN-LABEL: name:            uniform_vec_0_i16
-; GCN: S_LSHL_B32
 define amdgpu_kernel void @uniform_vec_0_i16(i32 addrspace(1)* %out, i16 %a) {
+; GCN-LABEL: uniform_vec_0_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s4, s2, 16
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX9-LABEL: uniform_vec_0_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s0, s4, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX906-LABEL: uniform_vec_0_i16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_lshl_b32 s0, s4, 16
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX906-NEXT:    s_endpgm
   %tmp = insertelement <2 x i16> undef, i16 0, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
   %val = bitcast <2 x i16> %vec to i32
@@ -12,18 +44,64 @@ define amdgpu_kernel void @uniform_vec_0_i16(i32 addrspace(1)* %out, i16 %a) {
   ret void
 }
 
-; GCN-LABEL: name:            divergent_vec_0_i16
-; GCN: V_LSHLREV_B32_e64
 define i32 @divergent_vec_0_i16(i16 %a) {
+; GCN-LABEL: divergent_vec_0_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: divergent_vec_0_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: divergent_vec_0_i16:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 0, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
   %val = bitcast <2 x i16> %vec to i32
   ret i32 %val
 }
 
-; GCN-LABEL: name:            uniform_vec_i16_0
-; GCN: S_AND_B32
 define amdgpu_kernel void @uniform_vec_i16_0(i32 addrspace(1)* %out, i16 %a) {
+; GCN-LABEL: uniform_vec_i16_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s4, s2, 0xffff
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX9-LABEL: uniform_vec_i16_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX906-LABEL: uniform_vec_i16_0:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX906-NEXT:    s_endpgm
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
   %val = bitcast <2 x i16> %vec to i32
@@ -31,18 +109,64 @@ define amdgpu_kernel void @uniform_vec_i16_0(i32 addrspace(1)* %out, i16 %a) {
   ret void
 }
 
-; GCN-LABEL: name:            divergent_vec_i16_0
-; GCN: V_AND_B32_e64
 define i32 @divergent_vec_i16_0(i16 %a) {
+; GCN-LABEL: divergent_vec_i16_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: divergent_vec_i16_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: divergent_vec_i16_0:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
   %val = bitcast <2 x i16> %vec to i32
   ret i32 %val
 }
 
-; GCN-LABEL: name:            uniform_vec_f16_0
-; GCN: S_AND_B32
 define amdgpu_kernel void @uniform_vec_f16_0(float addrspace(1)* %out, half %a) {
+; GCN-LABEL: uniform_vec_f16_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s4, s2, 0xffff
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX9-LABEL: uniform_vec_f16_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX906-LABEL: uniform_vec_f16_0:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX906-NEXT:    v_mov_b32_e32 v1, s0
+; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX906-NEXT:    s_endpgm
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
   %val = bitcast <2 x half> %vec to float
@@ -50,29 +174,71 @@ define amdgpu_kernel void @uniform_vec_f16_0(float addrspace(1)* %out, half %a)
   ret void
 }
 
-; GCN-LABEL: name:            divergent_vec_f16_0
-; GCN: V_CVT_F16_F32_e64 0, %0
-; GCN: COPY %1
-
-; GFX9-LABEL: name:            divergent_vec_f16_0
-; GFX9: V_AND_B32_e64
 define float @divergent_vec_f16_0(half %a) {
+; GCN-LABEL: divergent_vec_f16_0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: divergent_vec_f16_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: divergent_vec_f16_0:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
   %val = bitcast <2 x half> %vec to float
   ret float %val
 }
 
-; GCN-LABEL: name:            uniform_vec_i16_LL
-; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
-; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
-; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
-; GCN:  %[[SHL:[0-9]+]]:sreg_32 = S_LSHL_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
-; GCN: S_OR_B32 killed %[[AND]], killed %[[SHL]]
-
-; GFX9-LABEL: name:            uniform_vec_i16_LL
-; GFX9: S_PACK_LL_B32_B16
 define amdgpu_kernel void @uniform_vec_i16_LL(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) {
+; GCN-LABEL: uniform_vec_i16_LL:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use s0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_endpgm
+;
+; GFX9-LABEL: uniform_vec_i16_LL:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX906-LABEL: uniform_vec_i16_LL:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT:    ;;#ASMSTART
+; GFX906-NEXT:    ; use s0
+; GFX906-NEXT:    ;;#ASMEND
+; GFX906-NEXT:    s_endpgm
   %val0 = load volatile i32, i32 addrspace(4)* %in0
   %val1 = load volatile i32, i32 addrspace(4)* %in1
   %lo = trunc i32 %val0 to i16
@@ -84,34 +250,69 @@ define amdgpu_kernel void @uniform_vec_i16_LL(i32 addrspace(4)* %in0, i32 addrsp
   ret void
 }
 
-; GCN-LABEL: name:            divergent_vec_i16_LL
-; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
-; GCN: %[[SHL:[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed %[[SHIFT]], %1, implicit $exec
-; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
-; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 %0, killed %[[IMM]], implicit $exec
-; GCN: V_OR_B32_e64 killed %[[AND]], killed %[[SHL]], implicit $exec
-
-; GFX9-LABEL: name:            divergent_vec_i16_LL
-; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535
-; GFX9: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed %[[IMM]]
-; GFX9: V_LSHL_OR_B32_e64 %{{[0-9]+}}, 16, killed %[[AND]]
 define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
+; GCN-LABEL: divergent_vec_i16_LL:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: divergent_vec_i16_LL:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: divergent_vec_i16_LL:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
   %val = bitcast <2 x i16> %vec to i32
   ret i32 %val
 }
 
-; GCN-LABEL: name:            uniform_vec_i16_LH
-; GCN-DAG: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
-; GCN-DAG: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
-; GCN-DAG: %[[NEG:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
-; GCN-DAG: %[[ANDN:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[NEG]]
-; GCN: S_OR_B32 killed %[[AND]], killed %[[ANDN]]
-
-; GFX9-LABEL: name:            uniform_vec_i16_LH
-; GFX9: S_PACK_LH_B32_B16
 define amdgpu_kernel void @uniform_vec_i16_LH(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: uniform_vec_i16_LH:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s3, s3, 0xffff0000
+; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX9-LABEL: uniform_vec_i16_LH:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX906-LABEL: uniform_vec_i16_LH:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_endpgm
   %shift = lshr i32 %b, 16
   %tr = trunc i32 %shift to i16
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
@@ -121,10 +322,27 @@ define amdgpu_kernel void @uniform_vec_i16_LH(i32 addrspace(1)* %out, i16 %a, i3
   ret void
 }
 
-; GCN-LABEL: name:            divergent_vec_i16_LH
-; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
-; GCN: V_BFI_B32_e64 killed %[[IMM]]
 define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
+; GCN-LABEL: divergent_vec_i16_LH:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, 0xffff
+; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: divergent_vec_i16_LH:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    v_bfi_b32 v0, v2, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: divergent_vec_i16_LH:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %shift = lshr i32 %b, 16
   %tr = trunc i32 %shift to i16
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
@@ -133,16 +351,41 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
   ret i32 %val
 }
 
-; GCN-LABEL: name:            uniform_vec_i16_HH
-; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
-; GCN:  %[[SHR:[0-9]+]]:sreg_32 = S_LSHR_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
-; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
-; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
-; GCN: S_OR_B32 killed %[[SHR]], killed %[[AND]]
-
-; GFX9-LABEL: name:            uniform_vec_i16_HH
-; GFX9: S_PACK_HH_B32_B16
 define amdgpu_kernel void @uniform_vec_i16_HH(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+; GCN-LABEL: uniform_vec_i16_HH:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s2, s2, 16
+; GCN-NEXT:    s_and_b32 s3, s3, 0xffff0000
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX9-LABEL: uniform_vec_i16_HH:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX906-LABEL: uniform_vec_i16_HH:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
+; GFX906-NEXT:    v_mov_b32_e32 v1, s2
+; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT:    s_endpgm
   %shift_a = lshr i32 %a, 16
   %tr_a = trunc i32 %shift_a to i16
   %shift_b = lshr i32 %b, 16
@@ -154,17 +397,30 @@ define amdgpu_kernel void @uniform_vec_i16_HH(i32 addrspace(1)* %out, i32 %a, i3
   ret void
 }
 
-; GCN-LABEL: name:            divergent_vec_i16_HH
-; GCN: %[[SHR:[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed %{{[0-9]+}}, %0, implicit $exec
-; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
-; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 %1, killed %[[IMM]], implicit $exec
-; GCN: V_OR_B32_e64 killed %[[SHR]], killed %[[AND]], implicit $exec
-
-; GFX9-LABEL: name:            divergent_vec_i16_HH
-; GFX9: %[[SHR:[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, %0
-; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -65536, implicit $exec
-; GFX9: V_AND_OR_B32_e64 %1, killed %[[IMM]], killed %[[SHR]]
 define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
+; GCN-LABEL: divergent_vec_i16_HH:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: divergent_vec_i16_HH:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff0000
+; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: divergent_vec_i16_HH:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff0000
+; GFX906-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %shift_a = lshr i32 %a, 16
   %tr_a = trunc i32 %shift_a to i16
   %shift_b = lshr i32 %b, 16
@@ -175,16 +431,47 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
   ret i32 %val
 }
 
-; GCN-LABEL: name:            uniform_vec_f16_LL
-; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
-; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
-; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
-; GCN:  %[[SHL:[0-9]+]]:sreg_32 = S_LSHL_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
-; GCN: S_OR_B32 killed %[[AND]], killed %[[SHL]]
-
-; GFX9-LABEL: name:            uniform_vec_f16_LL
-; GFX9: S_PACK_LL_B32_B16
 define amdgpu_kernel void @uniform_vec_f16_LL(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) {
+; GCN-LABEL: uniform_vec_f16_LL:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use s0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_endpgm
+;
+; GFX9-LABEL: uniform_vec_f16_LL:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX906-LABEL: uniform_vec_f16_LL:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX906-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT:    ;;#ASMSTART
+; GFX906-NEXT:    ; use s0
+; GFX906-NEXT:    ;;#ASMEND
+; GFX906-NEXT:    s_endpgm
   %val0 = load volatile i32, i32 addrspace(4)* %in0
   %val1 = load volatile i32, i32 addrspace(4)* %in1
   %lo.i = trunc i32 %val0 to i16
@@ -199,36 +486,101 @@ define amdgpu_kernel void @uniform_vec_f16_LL(i32 addrspace(4)* %in0, i32 addrsp
   ret void
 }
 
-; GCN-LABEL: name:            divergent_vec_f16_LL
-; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
-; GCN: %[[SHL:[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed %[[SHIFT]]
-; GCN: V_OR_B32_e64 killed %{{[0-9]+}}, killed %[[SHL]], implicit $exec
-
-; GFX9-LABEL: name:            divergent_vec_f16_LL
-; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535
-; GFX9: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed %[[IMM]]
-; GFX9: V_LSHL_OR_B32_e64 %{{[0-9]+}}, 16, killed %[[AND]]
 define float @divergent_vec_f16_LL(half %a, half %b) {
+; GCN-LABEL: divergent_vec_f16_LL:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: divergent_vec_f16_LL:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: divergent_vec_f16_LL:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half %b, i32 1
   %val = bitcast <2 x half> %vec to float
   ret float %val
 }
 
-; GFX906-LABEL: name:            build_vec_v2i16_undeflo_divergent
-; GFX906: %[[LOAD:[0-9]+]]:vgpr_32 = DS_READ_U16
-; GFX906: %{{[0-9]+}}:vgpr_32 = COPY %[[LOAD]]
 define <2 x i16> @build_vec_v2i16_undeflo_divergent(i16 addrspace(3)* %in) #0 {
+; GCN-LABEL: build_vec_v2i16_undeflo_divergent:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 m0, -1
+; GCN-NEXT:    ds_read_u16 v0, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: build_vec_v2i16_undeflo_divergent:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_u16_d16 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: build_vec_v2i16_undeflo_divergent:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, i16 addrspace(3)* %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 0
   ret <2 x i16> %build
 }
 
-; GFX906-LABEL: name:            build_vec_v2i16_undeflo_uniform
-; GFX906: %[[LOAD:[0-9]+]]:vgpr_32 = DS_READ_U16
-; GFX906: %{{[0-9]+}}:sreg_32 = COPY %[[LOAD]]
 define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(i16 addrspace(3)* %in, i32 addrspace(1)* %out) #0 {
+; GCN-LABEL: build_vec_v2i16_undeflo_uniform:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 m0, -1
+; GCN-NEXT:    ds_read_u16 v0, v0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX9-LABEL: build_vec_v2i16_undeflo_uniform:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    ds_read_u16_d16 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX906-LABEL: build_vec_v2i16_undeflo_uniform:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX906-NEXT:    v_mov_b32_e32 v1, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v0, s4
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX906-NEXT:    s_endpgm
 entry:
   %load = load i16, i16 addrspace(3)* %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 0

diff  --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index ca8cf1c5ae12..48836409aa0e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -1,6 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
 
 declare half @llvm.fabs.f16(half) #0
 declare half @llvm.canonicalize.f16(half) #0
@@ -10,57 +11,191 @@ declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
 declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_undef_value_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_undef_value_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_undef_value_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half undef)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
-; GFX89: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
-
-; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
 define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: v_test_canonicalize_var_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    flat_store_short v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_var_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_canonicalize_var_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %val = load half, half addrspace(1)* %out
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
   store half %canonicalized, half addrspace(1)* undef
   ret void
 }
 
-; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
-; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
+; VI-LABEL: s_test_canonicalize_var_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v2, s2, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_test_canonicalize_var_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v1, s4, s4
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: s_test_canonicalize_var_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %val = bitcast i16 %val.arg to half
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
-; GFX9: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-
-; VI: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_max_f16_e32 v0, v0, v0
-; VI: v_or_b32_e32 v0, v0, v1
 define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
+; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_build_vector_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %ins0 = insertelement <2 x half> undef, half %lo, i32 0
   %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
-; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: v_test_canonicalize_fabs_var_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v2, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v2, |v2|, |v2|
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fabs_var_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_canonicalize_fabs_var_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %val = load half, half addrspace(1)* %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
   %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs)
@@ -68,13 +203,43 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
-; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
-
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v2, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v2, -|v2|, -|v2|
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %val = load half, half addrspace(1)* %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
   %val.fabs.fneg = fneg half %val.fabs
@@ -83,13 +248,43 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
-; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
-
-; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -{{v[0-9]+}}
-; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
 define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: v_test_canonicalize_fneg_var_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v2, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v2, -v2, -v2
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fneg_var_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_canonicalize_fneg_var_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %val = load half, half addrspace(1)* %out
   %val.fneg = fneg half %val
   %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
@@ -97,11 +292,43 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_var_f16:
-; VI: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}}
-; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 {
+; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v2, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mul_f16_e32 v2, -1.0, v2
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %val = load half, half addrspace(1)* %out
   %val.fneg = fneg half %val
   %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg)
@@ -109,15 +336,43 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half ad
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
-; VI: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}|
-; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
-
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
-
-; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|
-; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #2 {
+; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v2, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mul_f16_e64 v2, -1.0, |v2|
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %val = load half, half addrspace(1)* %out
   %val.fabs = call half @llvm.fabs.f16(half %val)
   %val.fabs.fneg = fneg half %val.fabs
@@ -126,158 +381,604 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ha
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_p0_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_p0_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_p0_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_n0_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_n0_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_n0_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x8000
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_p1_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_p1_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_p1_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_n1_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0xffffbc00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_n1_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffbc00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_n1_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0xbc00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_literal_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4c00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_literal_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4c00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_literal_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x4c00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
+; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0xffff83ff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff83ff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
+; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0xffff83ff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff83ff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_qnan_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7c00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_qnan_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7c00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_snan0_value_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_snan0_value_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_snan1_value_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_snan1_value_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_snan2_value_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_snan2_value_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_snan3_value_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_snan3_value_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
-; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; VI-NOT: v_and_b32
-
-; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+$}}
-; GFX9: global_store_dword v{{.+}}, [[REG]], s
 define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: v_test_canonicalize_var_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_or_b32_e32 v2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_var_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_canonicalize_var_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s7, s3
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
   %val = load <2 x half>, <2 x half> addrspace(1)* %gep
@@ -286,16 +987,62 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
-; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_max_f16_e64 [[REG1:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}|
-; VI-NOT: 0xffff
-; VI: v_or_b32
-
-; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
-; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]]{{$}}
-; GFX89: {{flat|global}}_store_dword
 define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: v_test_canonicalize_fabs_var_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; VI-NEXT:    v_or_b32_e32 v2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_canonicalize_fabs_var_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s7, s3
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
   %val = load <2 x half>, <2 x half> addrspace(1)* %gep
@@ -305,20 +1052,63 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
-; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
-; VI: v_or_b32
-
-; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
-; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1]{{$}}
-; GFX89: {{flat|global}}_store_dword
-
-; CI: v_cvt_f32_f16
-; CI: v_cvt_f32_f16
-; CI: v_mul_f32_e32 v{{[0-9]+}}, 1.0
-; CI: v_mul_f32_e32 v{{[0-9]+}}, 1.0
 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_e64 v0, -|v0|, -|v0|
+; VI-NEXT:    v_or_b32_e32 v2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s7, s3
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
   %val = load <2 x half>, <2 x half> addrspace(1)* %gep
@@ -329,14 +1119,62 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
-; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e64 [[REG0:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
-; VI-NOT: 0xffff
-
-; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
-; GFX9: global_store_dword v{{[0-9]+}}, [[REG]], s
 define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: v_test_canonicalize_fneg_var_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; VI-NEXT:    v_or_b32_e32 v2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: v_test_canonicalize_fneg_var_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s7, s3
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
   %val = load <2 x half>, <2 x half> addrspace(1)* %gep
@@ -346,390 +1184,1045 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa
   ret void
 }
 
-; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
-; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; VI-NOT: v_and_b32
-
-; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}}
-; GFX9: global_store_dword v{{[0-9]+}}, [[REG]], s
 define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
+; VI-LABEL: s_test_canonicalize_var_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s3, s2, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_max_f16_e64 v0, s2, s2
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_test_canonicalize_var_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: s_test_canonicalize_var_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_lshr_b32 s3, s2, 16
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT:    v_or_b32_e32 v0, v1, v0
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %val = bitcast i32 %val.arg to <2 x half>
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_p0_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_p0_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_p0_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_n0_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x80008000
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_n0_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x80008000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_n0_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x80008000
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_p1_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3c003c00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_p1_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3c003c00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_p1_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_n1_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0xbc00bc00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_n1_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xbc00bc00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_n1_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_literal_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4c004c00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_literal_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4c004c00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_literal_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff03ff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff03ff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff03ff
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
+; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3ff03ff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3ff03ff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x3ff03ff
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x83ff83ff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x83ff83ff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff83ff
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
+; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x83ff83ff
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x83ff83ff
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x83ff83ff
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_qnan_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7c007c00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c007c00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_qnan_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7c007c00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
-; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; FIXME: Extra 4th component handled
-; GCN-LABEL: {{^}}v_test_canonicalize_var_v3f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: s_setpc_b64
-
-; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
-; VI-DAG: v_max_f16_e32 v1, v1, v1
-; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
-
-; VI: s_setpc_b64
 define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
+; VI-LABEL: v_test_canonicalize_var_v3f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_var_v3f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_var_v3f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
   ret <3 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_var_v4f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: s_setpc_b64
-
-; VI-DAG: v_max_f16_sdwa [[CANON_ELT3:v[0-9]+]], v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[CANON_ELT2:v[0-9]+]], v1, v1
-; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
-; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
-; VI-DAG: v_or_b32_e32 v1, [[CANON_ELT2]], [[CANON_ELT3]]
-; VI: s_setpc_b64
 define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
+; VI-LABEL: v_test_canonicalize_var_v4f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v3
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_var_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_var_v4f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val)
   ret <4 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16:
-; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00
-; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
 define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: s_test_canonicalize_undef_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e007e00
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_test_canonicalize_undef_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: s_test_canonicalize_undef_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
-; GFX9-NEXT: s_setpc_b64
-
-; High bits known zero
-; FIXME: Should also be true on gfx9 by default?
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
+; VI-LABEL: v_test_canonicalize_reg_undef_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_reg_undef_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_reg_undef_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half %val, i32 0
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
-; GFX89: s_waitcnt
-; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX89-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
+; VI-LABEL: v_test_canonicalize_undef_reg_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_undef_reg_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_undef_reg_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v0
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half %val, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
-; GCN: s_waitcnt
-; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
-; GFX89-NEXT: s_setpc_b64
-
-; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
-; CI-NEXT: v_mov_b32_e32 v1, 1.0
-; CI-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
+; VI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v1, 1.0
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 1.0, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
-; GCN: s_waitcnt
-; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
-; GFX89-NEXT: s_setpc_b64
-
-; CI-NEXT: v_mov_b32_e32 v0, 1.0
-; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; CI-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
+; VI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, 1.0
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 1.0, i32 0
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
-; GCN: s_waitcnt
-; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
-; GFX89-NEXT: s_setpc_b64
-
-; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
-; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
-; CI-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
+; VI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v1, 0x41800000
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 16.0, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
-; GCN: s_waitcnt
-; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
-; GFX89-NEXT: s_setpc_b64
-
-; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
-; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; CI-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
+; VI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4c004c00
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41800000
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 16.0, i32 0
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
-; GFX9: s_waitcnt
-; GFX9-DAG: v_max_f16_e32 v0, v0, v0
-; GFX9: v_pack_b32_f16 v0, v0, 2.0
-; GFX9: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, 2.0, v0
-; VI-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
+; VI-LABEL: v_test_canonicalize_reg_k_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_or_b32_e32 v0, 2.0, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_reg_k_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 2.0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_reg_k_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 2.0
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = insertelement <2 x half> undef, half %val, i32 0
   %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
-; GFX9: v_max_f16_e32 v0, v0, v0
-; GFX9: v_pack_b32_f16 v0, 2.0, v0
-; GFX9: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0
-; VI-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
+; VI-LABEL: v_test_canonicalize_k_reg_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, 0x4000, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_k_reg_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_pack_b32_f16 v0, 2.0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_k_reg_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; CI-NEXT:    v_mov_b32_e32 v0, 2.0
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = insertelement <2 x half> undef, half 2.0, i32 0
   %vec1 = insertelement <2 x half> %vec0, half %val, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}s_test_canonicalize_undef_v4f16:
-; GCN: v_mov_b32_e32 v0, 0x7e007e00
-; GCN: v_mov_b32_e32 v1, v0
 define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(1)* %out) #1 {
+; VI-LABEL: s_test_canonicalize_undef_v4f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; VI-NEXT:    v_mov_b32_e32 v1, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_test_canonicalize_undef_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; CI-LABEL: s_test_canonicalize_undef_v4f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7e007e00
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v1, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
   store <4 x half> %canonicalized, <4 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
-; GFX9-NEXT: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
-; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
-; VI-NEXT: s_setpc_b64
 define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
+; VI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_or_b32_e32 v0, 0x7e000000, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <4 x half> undef, half %val, i32 0
   %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
   ret <4 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_reg_reg_undef_undef_v4f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-DAG: v_max_f16_e32 v0, v0, v0
-; VI-DAG: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
-; VI-NEXT: s_setpc_b64
 define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
+; VI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = insertelement <4 x half> undef, half %val0, i32 0
   %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
   %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
   ret <4 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
-; GFX9-NEXT: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_e32 v1, v1, v1
-; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_max_f16_e32 v0, v0, v0
-; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
-; VI-NEXT: v_or_b32_e32 v1, v1, v2
-; VI-NEXT: s_setpc_b64
 define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 {
+; VI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_or_b32_e32 v0, 0x7e000000, v0
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; CI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = insertelement <4 x half> undef, half %val0, i32 0
   %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
   %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3

diff  --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 0072ab932302..551a0209237c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -1,21 +1,66 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s
-; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX803 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900-FLATSCR %s
 
-; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo:
-; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: ds_read_u16 v2, v0
-; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; GFX900-DAG: s_waitcnt lgkmcnt(0)
-; GFX900-DAG: v_mov_b32_e32 v1, v2
-; GFX900-DAG: ds_read_u16_d16_hi v1, v0 offset:16
-; GFX900: ds_write_b16 [[ZERO]], v2
-; GFX900-NEXT: s_waitcnt lgkmcnt(1)
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_setpc_b64 s[30:31]
 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
+; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16 v2, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v1, v2
+; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0 offset:16
+; GFX900-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-NEXT:    ds_write_b16 v0, v2
+; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v1, v0
+; GFX906-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    ds_write_b16 v2, v1
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v1, v0
+; GFX803-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX803-NEXT:    v_mov_b32_e32 v2, 0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    ds_write_b16 v2, v1
+; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v2, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, v2
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0 offset:16
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-FLATSCR-NEXT:    ds_write_b16 v0, v2
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
   %load.lo = load i16, i16 addrspace(3)* %in
@@ -26,18 +71,63 @@ entry:
   ret <2 x i16> %build1
 }
 
-; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi:
-; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16
-; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0
-; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]]
-; GFX900-DAG: s_waitcnt lgkmcnt(1)
-; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]]
-; GFX900: v_lshl_or_b32 v{{[0-9]+}}, [[HI]], 16, [[AND]]
-; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_setpc_b64 s[30:31]
 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
+; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16 v1, v0 offset:16
+; GFX900-NEXT:    ds_read_u16 v0, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    ds_write_b16 v2, v1
+; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v1, v0 offset:16
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    ds_write_b16 v2, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v1, v0 offset:16
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    v_mov_b32_e32 v2, 0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    ds_write_b16 v2, v1
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v1, v0 offset:16
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    ds_write_b16 v2, v1
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX900-FLATSCR-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
   %load.lo = load i16, i16 addrspace(3)* %in
@@ -48,18 +138,63 @@ entry:
   ret <2 x i16> %build1
 }
 
-; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi:
-; GFX900: ds_read_u16 v3, v0
-; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
-; GFX900-NEXT: s_waitcnt lgkmcnt(1)
-; GFX900-NEXT: ds_write_b16 v1, v3
-; GFX900-NEXT: s_waitcnt lgkmcnt(1)
-; GFX900-NEXT: ds_write_b16 v2, v0
-; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: s_setpc_b64 s[30:31]
 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
+; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16 v3, v0
+; GFX900-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    ds_write_b16 v1, v3
+; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    ds_write_b16 v2, v0
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xffff, v3
+; GFX900-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v3, v0
+; GFX906-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    ds_write_b16 v1, v3
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    ds_write_b16 v2, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v3
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v3, v0
+; GFX803-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    ds_write_b16 v1, v3
+; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    ds_write_b16 v2, v0
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v3, v0
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    ds_write_b16 v1, v3
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    ds_write_b16 v2, v0
+; GFX900-FLATSCR-NEXT:    v_and_b32_e32 v1, 0xffff, v3
+; GFX900-FLATSCR-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
   %load.lo = load i16, i16 addrspace(3)* %in
@@ -71,29 +206,78 @@ entry:
   ret <2 x i16> %build1
 }
 
-; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u16 v
 define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
+; GFX900-LABEL: load_local_hi_v2i16_undeflo:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16_d16_hi v0, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2i16_undeflo:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2i16_undeflo:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_undeflo:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v0, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, i16 addrspace(3)* %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 1
   ret <2 x i16> %build
 }
 
-; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u16 v
 define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_local_hi_v2i16_reglo:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2i16_reglo:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2i16_reglo:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, i16 addrspace(3)* %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -101,17 +285,47 @@ entry:
   ret <2 x i16> %build1
 }
 
-; Show that we get reasonable regalloc without physreg constraints.
-; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u16 v
 define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, i16 addrspace(3)* %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -120,16 +334,41 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
-; GCN: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u16 v
 define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
+; GFX900-LABEL: load_local_hi_v2i16_zerolo:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0
+; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2i16_zerolo:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2i16_zerolo:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_zerolo:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, i16 addrspace(3)* %in
   %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
@@ -137,16 +376,39 @@ entry:
 }
 
 ; FIXME: Remove m0 initialization
-; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_u16 v0, v0
-; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u16 v
-; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
 define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
+; GFX900-LABEL: load_local_hi_v2i16_zerolo_shift:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16 v0, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2i16_zerolo_shift:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2i16_zerolo_shift:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_zerolo_shift:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, i16 addrspace(3)* %in
   %zext = zext i16 %load to i32
@@ -154,16 +416,47 @@ entry:
   ret i32 %shift
 }
 
-; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u16 v
 define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
+; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load half, half addrspace(3)* %in
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
@@ -172,16 +465,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u8 v
 define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u8_d16_hi v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u8 v0, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u8 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u8_d16_hi v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, i8 addrspace(3)* %in
   %ext = zext i8 %load to i16
@@ -191,16 +515,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_i8 v
 define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_i8_d16_hi v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_i8 v0, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_i8 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_i8_d16_hi v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, i8 addrspace(3)* %in
   %ext = sext i8 %load to i16
@@ -210,16 +565,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u8 v
 define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
+; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u8_d16_hi v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u8 v0, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u8 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u8_d16_hi v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, i8 addrspace(3)* %in
   %ext = zext i8 %load to i16
@@ -231,16 +617,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_i8 v
 define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
+; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_i8_d16_hi v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_i8 v0, v0
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_i8 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_i8_d16_hi v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, i8 addrspace(3)* %in
   %ext = sext i8 %load to i16
@@ -252,14 +669,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
   %load = load i16, i16 addrspace(1)* %gep
@@ -269,14 +720,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
+; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
   %load = load half, half addrspace(1)* %gep
@@ -286,14 +771,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
   %load = load i8, i8 addrspace(1)* %gep
@@ -304,14 +823,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
   %load = load i8, i8 addrspace(1)* %gep
@@ -322,14 +875,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
+; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
   %load = load i8, i8 addrspace(1)* %gep
@@ -341,14 +928,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
+; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
   %load = load i8, i8 addrspace(1)* %gep
@@ -360,19 +981,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: load_flat_hi_v2i16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
-; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; GFX803: v_or_b32_sdwa
-; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
 define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_short_d16_hi v2, v[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    flat_load_short_d16_hi v2, v[0:1]
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, i16* %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -381,19 +1029,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
-; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; GFX803: v_or_b32_sdwa
-; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
 define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
+; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_short_d16_hi v2, v[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    flat_load_short_d16_hi v2, v[0:1]
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load half, half* %in
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
@@ -402,19 +1077,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
-; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; GFX803: v_or_b32_sdwa
-; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
 define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_ubyte_d16_hi v2, v[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    flat_load_ubyte_d16_hi v2, v[0:1]
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, i8* %in
   %ext = zext i8 %load to i16
@@ -424,19 +1126,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
-; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; GFX803: v_or_b32_sdwa
-; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
 define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_sbyte_d16_hi v2, v[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    flat_load_sbyte_d16_hi v2, v[0:1]
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, i8* %in
   %ext = sext i8 %load to i16
@@ -446,19 +1175,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
-; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; GFX803: v_or_b32_sdwa
-; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
 define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
+; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_ubyte_d16_hi v2, v[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    flat_load_ubyte_d16_hi v2, v[0:1]
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, i8* %in
   %ext = zext i8 %load to i16
@@ -469,19 +1225,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v[0:1], v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
-; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; GFX803: v_or_b32_sdwa
-; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
 define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
+; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_sbyte_d16_hi v2, v[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    flat_load_sbyte_d16_hi v2, v[0:1]
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, i8* %in
   %ext = sext i8 %load to i16
@@ -492,17 +1275,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-MUBUF:   buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
-; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
 define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s32 offset:4094
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
   %load = load i16, i16 addrspace(5)* %gep
@@ -512,17 +1324,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-MUBUF:   buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
-; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
 define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in, half %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s32 offset:4094
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
   %load = load half, half addrspace(5)* %gep
@@ -532,18 +1373,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
-; GCN: s_waitcnt
-; GFX900-MUBUFF:  buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
-; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe
-; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]] glc{{$}}
-; GFX900: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s0 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -552,18 +1422,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
-; GCN: s_waitcnt
-; GFX900-MUBUF-NEXT:   buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
-; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]] glc{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
 define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v1, off, s0 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
@@ -572,17 +1471,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-MUBUF:   buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
-; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v0, off, s32 offset:4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
@@ -593,17 +1521,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-MUBUF:   buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
-; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
 define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v0, off, s32 offset:4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
@@ -615,17 +1572,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-MUBUF:   buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
-; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
 define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16_hi v0, off, s32 offset:4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
@@ -637,17 +1623,46 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-MUBUF:   buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
-; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16_hi v0, off, s32 offset:4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
@@ -658,18 +1673,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
-; GCN: s_waitcnt
-; GFX900-MUBUF-NEXT:   buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
-; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v1, off, s0 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
   %ext = zext i8 %load to i16
@@ -679,18 +1723,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
-; GCN: s_waitcnt
-; GFX900-MUBUF-NEXT:   buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
-; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]] glc{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16_hi v1, off, s0 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
   %ext = sext i8 %load to i16
@@ -700,18 +1773,47 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
-; GCN: s_waitcnt
-; GFX900-MUBUF-NEXT:   buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
-; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
-; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
 define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
+; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
+; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v1, off, s0 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
   %ext = zext i8 %load to i16
@@ -722,17 +1824,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; GFX803: flat_load_ushort
-; GFX906: global_load_ushort
 define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
+; GFX900-LABEL: load_constant_hi_v2i16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_constant_hi_v2i16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_constant_hi_v2i16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_constant_hi_v2i16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
   %load = load i16, i16 addrspace(4)* %gep
@@ -742,17 +1875,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: load_constant_hi_v2f16_reglo_vreg
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
-
-; GFX803: flat_load_ushort
-; GFX906: global_load_ushort
 define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
+; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
   %load = load half, half addrspace(4)* %gep
@@ -762,14 +1926,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
+; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
   %load = load i8, i8 addrspace(4)* %gep
@@ -781,14 +1979,48 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_store_dword
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
+; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
   %load = load i8, i8 addrspace(4)* %gep
@@ -803,16 +2035,58 @@ entry:
 ; Local object gives known offset, so requires converting from offen
 ; to offset variant.
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
-; GFX900-MUBUF:        buffer_store_dword
-; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:   buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4058
-; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-FLATSCR:      scratch_store_dword
-; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4058
-; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX900-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4058
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX906-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4058
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX803-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4058
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX900-FLATSCR-NEXT:    scratch_store_dword v1, v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s32 offset:4058
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
@@ -825,16 +2099,58 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
-; GFX900-MUBUF:        buffer_store_dword
-; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:   buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4059
-; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-FLATSCR:      scratch_store_dword
-; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4059
-; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX900-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4059
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX906-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4059
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX803-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4059
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX900-FLATSCR-NEXT:    scratch_store_dword v1, v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16_hi v0, off, s32 offset:4059
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
@@ -848,16 +2164,58 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
-; GFX900-MUBUF:        buffer_store_dword
-; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-MUBUF-NEXT:   buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4059
-; GFX900-MUBUF-NEXT:   s_waitcnt vmcnt(0)
-; GFX900-FLATSCR:      scratch_store_dword
-; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4059
-; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
+; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX900-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4059
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX906-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4059
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    global_store_dword v[0:1], v0, off
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX803-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4059
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    flat_store_dword v[0:1], v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GFX900-FLATSCR-NEXT:    scratch_store_dword v1, v2, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v0, off, s32 offset:4059
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
@@ -873,15 +2231,48 @@ entry:
 
 ; FIXME: Remove m0 init and waitcnt between reads
 ; FIXME: Is there a cost to using the extload over not?
-; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
-; GCN: s_waitcnt
-; GFX900-NEXT: ds_read_u16 v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
+; GFX900-LABEL: load_local_v2i16_split_multi_chain:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16 v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0 offset:2
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_v2i16_split_multi_chain:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v1, v0
+; GFX906-NEXT:    ds_read_u16 v0, v0 offset:2
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_v2i16_split_multi_chain:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v1, v0
+; GFX803-NEXT:    ds_read_u16 v0, v0 offset:2
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_v2i16_split_multi_chain:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0 offset:2
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
   %load0 = load volatile i16, i16 addrspace(3)* %in
@@ -891,17 +2282,49 @@ entry:
   ret <2 x i16> %build1
 }
 
-; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
-; GFX900: ds_read_u16 v1, v0
-; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
-; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v0, v1
-; GFX900-NEXT: s_setpc_b64
-
-; NO-D16-HI: ds_read_u16
-; NO-D16-HI: ds_read_u16
 define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
+; GFX900-LABEL: load_local_lo_hi_v2i16_samechain:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16 v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0 offset:16
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_lo_hi_v2i16_samechain:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v1, v0
+; GFX906-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_lo_hi_v2i16_samechain:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v1, v0 offset:16
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_samechain:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0 offset:16
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
   %load.lo = load i16, i16 addrspace(3)* %in
@@ -912,12 +2335,43 @@ entry:
 }
 
 ; FIXME: Remove and
-; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
-; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
-; GCN-NOT: ds_read
-; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
-; GFX9: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
 define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
+; GFX900-LABEL: load_local_v2i16_broadcast:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16 v0, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX900-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_v2i16_broadcast:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v0, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_v2i16_broadcast:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v0, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_v2i16_broadcast:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX900-FLATSCR-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
   %load0 = load i16, i16 addrspace(3)* %in
@@ -926,15 +2380,56 @@ entry:
   ret <2 x i16> %build1
 }
 
-; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
-; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
-; GFX900: ds_write_b16
-; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16
-
-; NO-D16-HI: ds_read_u16
-; NO-D16-HI: ds_write_b16
-; NO-D16-HI: ds_read_u16
 define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
+; GFX900-LABEL: load_local_lo_hi_v2i16_side_effect:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    ds_read_u16 v2, v0
+; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX900-NEXT:    ds_write_b16 v1, v3
+; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    ds_read_u16_d16_hi v2, v0 offset:16
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_lo_hi_v2i16_side_effect:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX906-NEXT:    ds_read_u16 v2, v0
+; GFX906-NEXT:    ds_write_b16 v1, v3
+; GFX906-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX906-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_lo_hi_v2i16_side_effect:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX803-NEXT:    ds_read_u16 v2, v0
+; GFX803-NEXT:    ds_write_b16 v1, v3
+; GFX803-NEXT:    ds_read_u16 v0, v0 offset:16
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_side_effect:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    ds_read_u16 v2, v0
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v3, 0x7b
+; GFX900-FLATSCR-NEXT:    ds_write_b16 v1, v3
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v2, v0 offset:16
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
   %load.lo = load i16, i16 addrspace(3)* %in
@@ -946,15 +2441,50 @@ entry:
 }
 
 ; FIXME: Remove waitcnt between reads
-; GCN-LABEL: {{^}}load_global_v2i16_split:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_ushort v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_load_short_d16_hi v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
+; GFX900-LABEL: load_global_v2i16_split:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_ushort v2, v[0:1], off glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:2 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_global_v2i16_split:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ushort v2, v[0:1], off glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    global_load_ushort v3, v[0:1], off offset:2 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX906-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_global_v2i16_split:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    flat_load_ushort v1, v[2:3] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_global_v2i16_split:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:2 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
   %load0 = load volatile i16, i16 addrspace(1)* %in
@@ -965,15 +2495,50 @@ entry:
 }
 
 ; FIXME: Remove waitcnt between reads
-; GCN-LABEL: {{^}}load_flat_v2i16_split:
-; GCN: s_waitcnt
-; GFX900-NEXT: flat_load_ushort v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: flat_load_short_d16_hi v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
+; GFX900-LABEL: load_flat_v2i16_split:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_short_d16_hi v2, v[0:1] offset:2 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_flat_v2i16_split:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    flat_load_ushort v3, v[0:1] offset:2 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX906-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_flat_v2i16_split:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    flat_load_ushort v1, v[2:3] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_flat_v2i16_split:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    flat_load_short_d16_hi v2, v[0:1] offset:2 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16* %in, i64 1
   %load0 = load volatile i16, i16* %in
@@ -984,15 +2549,49 @@ entry:
 }
 
 ; FIXME: Remove waitcnt between reads
-; GCN-LABEL: {{^}}load_constant_v2i16_split:
-; GCN: s_waitcnt
-; GFX900-NEXT: global_load_ushort v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: global_load_short_d16_hi v2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v0, v2
-; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
+; GFX900-LABEL: load_constant_v2i16_split:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_ushort v2, v[0:1], off glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:2 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_constant_v2i16_split:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    global_load_ushort v2, v[0:1], off glc
+; GFX906-NEXT:    global_load_ushort v3, v[0:1], off offset:2 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(1)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_constant_v2i16_split:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX803-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX803-NEXT:    flat_load_ushort v1, v[2:3] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_constant_v2i16_split:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:2 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
   %load0 = load volatile i16, i16 addrspace(4)* %in
@@ -1004,16 +2603,46 @@ entry:
 
 ; FIXME: Remove m0 init and waitcnt between reads
 ; FIXME: Is there a cost to using the extload over not?
-; GCN-LABEL: {{^}}load_private_v2i16_split:
-; GCN: s_waitcnt
-; GFX900-MUBUF:   buffer_load_ushort v0, off, s[0:3], s32 glc{{$}}
-; GFX900-FLATSCR: scratch_load_ushort v0, off, s32 glc{{$}}
-; GFX900-NEXT: s_waitcnt
-; GFX900-MUBUF-NEXT:   buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
-; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval(i16) %in) #0 {
+; GFX900-LABEL: load_private_v2i16_split:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_private_v2i16_split:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc
+; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_private_v2i16_split:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_private_v2i16_split:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_ushort v0, off, s32 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s32 offset:2 glc
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
   %load0 = load volatile i16, i16 addrspace(5)* %in
@@ -1026,15 +2655,53 @@ entry:
 ; FIXME: This test should work without copying of v0.
 ;        ds_read_u16_d16_hi preserves low 16 bits of the destination
 ;        and ds_write_b16 only reads low 16 bits.
-; GCN: s_waitcnt
-; GFX900:      v_mov_b32_e32 [[COPY:v[0-9]+]], v0
-; GFX900-NEXT: ds_read_u16_d16_hi [[COPY]], v1
-; GFX900-NEXT: ds_write_b16 v1, v0
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v0, [[COPY]]
-; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* %in) #0 {
+; GFX900-LABEL: load_local_hi_v2i16_store_local_lo:
+; GFX900:       ; %bb.0: ; %entry
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, v0
+; GFX900-NEXT:    ds_read_u16_d16_hi v2, v1
+; GFX900-NEXT:    ds_write_b16 v1, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: load_local_hi_v2i16_store_local_lo:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    ds_read_u16 v2, v1
+; GFX906-NEXT:    v_and_b32_e32 v3, 0xffff, v0
+; GFX906-NEXT:    ds_write_b16 v1, v0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    v_lshl_or_b32 v2, v2, 16, v3
+; GFX906-NEXT:    v_mov_b32_e32 v0, v2
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX803-LABEL: load_local_hi_v2i16_store_local_lo:
+; GFX803:       ; %bb.0: ; %entry
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_mov_b32 m0, -1
+; GFX803-NEXT:    ds_read_u16 v2, v1
+; GFX803-NEXT:    ds_write_b16 v1, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX803-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_mov_b32_e32 v0, v2
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_store_local_lo:
+; GFX900-FLATSCR:       ; %bb.0: ; %entry
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v2, v1
+; GFX900-FLATSCR-NEXT:    ds_write_b16 v1, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, i16 addrspace(3)* %in
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0

diff  --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index b2881ce43596..63bf55f679e5 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -1,14 +1,52 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX803 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
 
 
-; GCN-LABEL: {{^}}s_pack_v2i16:
-; GFX9: s_load_dword [[VAL0:s[0-9]+]]
-; GFX9: s_load_dword [[VAL1:s[0-9]+]]
-; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {
+; GFX9-LABEL: s_pack_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: s_pack_v2i16:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX803-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX803-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX803-NEXT:    s_or_b32 s0, s0, s1
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ; use s0
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: s_pack_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use s0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_endpgm
   %val0 = load volatile i32, i32 addrspace(4)* %in0
   %val1 = load volatile i32, i32 addrspace(4)* %in1
   %lo = trunc i32 %val0 to i16
@@ -21,11 +59,44 @@ define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(4)* %in0, i32 addrspace(4)
   ret void
 }
 
-; GCN-LABEL: {{^}}s_pack_v2i16_imm_lo:
-; GFX9: s_load_dword [[VAL1:s[0-9]+]]
-; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]]
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(4)* %in1) #0 {
+; GFX9-LABEL: s_pack_v2i16_imm_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x1c8, s0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: s_pack_v2i16_imm_lo:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX803-NEXT:    s_or_b32 s0, s0, 0x1c8
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ; use s0
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: s_pack_v2i16_imm_lo:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX7-NEXT:    s_or_b32 s0, s0, 0x1c8
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use s0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_endpgm
   %val1 = load i32, i32 addrspace(4)* %in1
   %hi = trunc i32 %val1 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0
@@ -36,11 +107,44 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(4)* %in1) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}s_pack_v2i16_imm_hi:
-; GFX9: s_load_dword [[VAL0:s[0-9]+]]
-; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(4)* %in0) #0 {
+; GFX9-LABEL: s_pack_v2i16_imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x1c8
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: s_pack_v2i16_imm_hi:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX803-NEXT:    s_or_b32 s0, s0, 0x1c80000
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ; use s0
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: s_pack_v2i16_imm_hi:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT:    s_or_b32 s0, s0, 0x1c80000
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use s0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_endpgm
   %val0 = load i32, i32 addrspace(4)* %in0
   %lo = trunc i32 %val0 to i16
   %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
@@ -51,14 +155,67 @@ define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(4)* %in0) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2i16:
-; GFX9: global_load_dword [[VAL0:v[0-9]+]]
-; GFX9: global_load_dword [[VAL1:v[0-9]+]]
-
-; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]]
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+; GFX9-LABEL: v_pack_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: v_pack_v2i16:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v1, s1
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX803-NEXT:    v_mov_b32_e32 v3, s3
+; GFX803-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
+; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    flat_load_dword v1, v[2:3] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ; use v0
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: v_pack_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -74,15 +231,72 @@ define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2i16_user:
-; GFX9: global_load_dword [[VAL0:v[0-9]+]]
-; GFX9: global_load_dword [[VAL1:v[0-9]+]]
-
-; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]]
-
-; GFX9: v_add_u32_e32 v{{[0-9]+}}, 9, [[PACKED]]
 define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+; GFX9-LABEL: v_pack_v2i16_user:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, 9, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: v_pack_v2i16_user:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v1, s1
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX803-NEXT:    v_mov_b32_e32 v3, s3
+; GFX803-NEXT:    v_add_u32_e32 v2, vcc, s2, v2
+; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    flat_load_dword v1, v[2:3] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_mov_b32 s3, 0x1100f000
+; GFX803-NEXT:    s_mov_b32 s2, -1
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 9, v0
+; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: v_pack_v2i16_user:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0x100f000
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 9, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -99,15 +313,54 @@ define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspa
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2i16_imm_lo:
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]]
-; GFX9-DENORM-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}}
-
-; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b{{$}}
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
-
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
+; GFX9-LABEL: v_pack_v2i16_imm_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: v_pack_v2i16_imm_lo:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v1, s1
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_e32 v0, 0x7b, v0
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ; use v0
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: v_pack_v2i16_imm_lo:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, 0x7b, v0
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
@@ -120,12 +373,53 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_lo:
-; GFX9: global_load_dword [[VAL1:v[0-9]+]]
-
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, 64
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+; GFX9-LABEL: v_pack_v2i16_inline_imm_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, 64
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: v_pack_v2i16_inline_imm_lo:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v1, s1
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    v_or_b32_e32 v0, 64, v0
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ; use v0
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: v_pack_v2i16_inline_imm_lo:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, 64, v0
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
@@ -138,14 +432,55 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2i16_imm_hi:
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]]
-
-; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}}
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[VAL0]]
-
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
+; GFX9-LABEL: v_pack_v2i16_imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s0, 0x7b
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: v_pack_v2i16_imm_hi:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v1, s1
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b0000
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ; use v0
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: v_pack_v2i16_imm_hi:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, 0x7b0000, v0
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -158,11 +493,54 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_hi:
-; GFX9: global_load_dword [[VAL:v[0-9]+]]
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], 7, 16, [[VAL]]
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+; GFX9-LABEL: v_pack_v2i16_inline_imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, 7, 16, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GFX803-LABEL: v_pack_v2i16_inline_imm_hi:
+; GFX803:       ; %bb.0:
+; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v1, s1
+; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    v_mov_b32_e32 v1, 0x70000
+; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT:    ;;#ASMSTART
+; GFX803-NEXT:    ; use v0
+; GFX803-NEXT:    ;;#ASMEND
+; GFX803-NEXT:    s_endpgm
+;
+; GFX7-LABEL: v_pack_v2i16_inline_imm_hi:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX7-NEXT:    s_mov_b32 s3, 0x100f000
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, 0x70000, v0
+; GFX7-NEXT:    ;;#ASMSTART
+; GFX7-NEXT:    ; use v0
+; GFX7-NEXT:    ;;#ASMEND
+; GFX7-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext


        


More information about the llvm-commits mailing list