[llvm] [NFC][AMDGPU] Auto-generate check lines for some test cases (PR #112426)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 16 17:06:45 PDT 2024


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/112426

>From 76a1bb22fc83ff63e98f2d85ffd4b63e45518847 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 15 Oct 2024 15:58:20 -0400
Subject: [PATCH] [NFC][AMDGPU] Auto-generate check lines for
 `llvm/test/CodeGen/AMDGPU/andorbitset.ll`

---
 llvm/test/CodeGen/AMDGPU/andorbitset.ll       | 102 +++-
 llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll    |  79 ++-
 llvm/test/CodeGen/AMDGPU/fabs.f64.ll          | 155 ++++--
 .../llvm.amdgcn.raw.ptr.buffer.store.ll       | 471 +++++++++++++-----
 llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll        | 137 ++++-
 5 files changed, 752 insertions(+), 192 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
index a189ba9b103421..0fa58f3c444a54 100644
--- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
@@ -1,48 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: {{^}}s_clear_msb:
-; SI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_clear_msb:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset0_b32 s4, 31
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = and i32 %in, 2147483647
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_set_msb:
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_set_msb(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_set_msb:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset1_b32 s4, 31
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = or i32 %in, 2147483648
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_clear_lsb:
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2
 define amdgpu_kernel void @s_clear_lsb(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_clear_lsb:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s4, s4, -2
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = and i32 %in, 4294967294
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_set_lsb:
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
 define amdgpu_kernel void @s_set_lsb(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_set_lsb:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_or_b32 s4, s4, 1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = or i32 %in, 1
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_clear_midbit:
-; SI: s_bitset0_b32 s{{[0-9]+}}, 8
 define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_clear_midbit:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset0_b32 s4, 8
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = and i32 %in, 4294967039
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_set_midbit:
-; SI: s_bitset1_b32 s{{[0-9]+}}, 8
 define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_set_midbit:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset1_b32 s4, 8
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = or i32 %in, 256
   store i32 %x, ptr addrspace(1) %out
   ret void
@@ -51,10 +106,27 @@ define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) {
 @gv = external addrspace(1) global i32
 
 ; Make sure there's no verifier error with an undef source.
-; SI-LABEL: {{^}}bitset_verifier_error:
-; SI-NOT:   %bb.1:
-; SI:       s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
 define void @bitset_verifier_error() local_unnamed_addr #0 {
+; SI-LABEL: bitset_verifier_error:
+; SI:       ; %bb.0: ; %bb
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_getpc_b64 s[4:5]
+; SI-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
+; SI-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s8, s4, 0x7fffffff
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3f7fbe77
+; SI-NEXT:    v_cmp_ge_f32_e64 s[4:5], |s4|, v0
+; SI-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_cbranch_vccnz .LBB6_2
+; SI-NEXT:  ; %bb.1: ; %bb5
+; SI-NEXT:  .LBB6_2: ; %bb6
 bb:
   %i = call float @llvm.fabs.f32(float undef) #0
   %i1 = bitcast float %i to i32
diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
index dc158028bd7b05..4b56b5e9d24f5c 100644
--- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -1,48 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: {{^}}s_or_to_orn2:
-; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
 define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_or_to_orn2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_orn2_b32 s4, s4, 50
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = or i32 %in, -51
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_or_to_orn2_imm0:
-; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
 define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_or_to_orn2_imm0:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_orn2_b32 s4, s4, 50
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = or i32 -51, %in
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_and_to_andn2:
-; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
 define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_and_to_andn2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_andn2_b32 s4, s4, 50
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = and i32 %in, -51
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_and_to_andn2_imm0:
-; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
 define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_and_to_andn2_imm0:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_andn2_b32 s4, s4, 50
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = and i32 -51, %in
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_xor_to_xnor:
-; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
 define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_xor_to_xnor:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_xnor_b32 s4, s4, 50
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = xor i32 %in, -51
   store i32 %x, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_xor_to_xnor_imm0:
-; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
 define amdgpu_kernel void @s_xor_to_xnor_imm0(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_xor_to_xnor_imm0:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_xnor_b32 s4, s4, 50
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %x = xor i32 -51, %in
   store i32 %x, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
index 32d5fa6e72d791..f98124fe2ed731 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
@@ -7,10 +8,25 @@ declare double @llvm.fabs.f64(double) readnone
 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
 declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
 
-; FUNC-LABEL: {{^}}v_fabs_f64:
-; SI: v_and_b32
-; SI: s_endpgm
 define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; SI-LABEL: v_fabs_f64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %tidext = sext i32 %tid to i64
   %gep = getelementptr double, ptr addrspace(1) %in, i64 %tidext
@@ -20,75 +36,148 @@ define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %i
   ret void
 }
 
-; FUNC-LABEL: {{^}}fabs_f64:
-; SI: s_bitset0_b32
-; SI: s_endpgm
 define amdgpu_kernel void @fabs_f64(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: fabs_f64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset0_b32 s3, 31
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_endpgm
   %fabs = call double @llvm.fabs.f64(double %in)
   store double %fabs, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fabs_v2f64:
-; SI: s_bitset0_b32
-; SI: s_bitset0_b32
-; SI: s_endpgm
 define amdgpu_kernel void @fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) {
+; SI-LABEL: fabs_v2f64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xd
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset0_b32 s7, 31
+; SI-NEXT:    s_bitset0_b32 s5, 31
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v3, s7
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
   store <2 x double> %fabs, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fabs_v4f64:
-; SI: s_bitset0_b32
-; SI: s_bitset0_b32
-; SI: s_bitset0_b32
-; SI: s_bitset0_b32
-; SI: s_endpgm
 define amdgpu_kernel void @fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) {
+; SI-LABEL: fabs_v4f64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x11
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset0_b32 s7, 31
+; SI-NEXT:    s_bitset0_b32 s11, 31
+; SI-NEXT:    s_bitset0_b32 s9, 31
+; SI-NEXT:    s_bitset0_b32 s5, 31
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    v_mov_b32_e32 v6, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; SI-NEXT:    v_mov_b32_e32 v5, s5
+; SI-NEXT:    v_mov_b32_e32 v7, s7
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
   store <4 x double> %fabs, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}fabs_fold_f64:
-; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-NOT: and
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
-; SI: s_endpgm
 define amdgpu_kernel void @fabs_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) {
+; SI-LABEL: fabs_fold_f64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x1d
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x13
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mul_f64 v[0:1], |s[6:7]|, v[0:1]
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %fabs = call double @llvm.fabs.f64(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}fabs_fn_fold_f64:
-; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-NOT: and
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
-; SI: s_endpgm
 define amdgpu_kernel void @fabs_fn_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) {
+; SI-LABEL: fabs_fn_fold_f64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x1d
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x13
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mul_f64 v[0:1], |s[6:7]|, v[0:1]
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
   %fabs = call double @fabs(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fabs_free_f64:
-; SI: s_bitset0_b32
-; SI: s_endpgm
 define amdgpu_kernel void @fabs_free_f64(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: fabs_free_f64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset0_b32 s3, 31
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_endpgm
   %bc= bitcast i64 %in to double
   %fabs = call double @llvm.fabs.f64(double %bc)
   store double %fabs, ptr addrspace(1) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fabs_fn_free_f64:
-; SI: s_bitset0_b32
-; SI: s_endpgm
 define amdgpu_kernel void @fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: fabs_fn_free_f64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitset0_b32 s3, 31
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_endpgm
   %bc= bitcast i64 %in to double
   %fabs = call double @fabs(double %bc)
   store double %fabs, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
index d9227724c22a14..855ca390aabdce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
@@ -1,12 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
 ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
-;CHECK-LABEL: {{^}}buffer_store:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
 define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) {
+; VERDE-LABEL: buffer_store:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VERDE-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
+; VERDE-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CHECK-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
+; CHECK-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1)
@@ -14,34 +23,54 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_immoffs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) {
+; VERDE-LABEL: buffer_store_immoffs:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_immoffs:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 42, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_ofs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) {
+; VERDE-LABEL: buffer_store_ofs:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_ofs:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0)
   ret void
 }
 
 ; Ideally, the register allocator would avoid the wait here
-;
-;CHECK-LABEL: {{^}}buffer_store_wait:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
-;VERDE: s_waitcnt expcnt(0)
-;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) {
+; VERDE-LABEL: buffer_store_wait:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; VERDE-NEXT:    s_waitcnt expcnt(0)
+; VERDE-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
+; VERDE-NEXT:    s_waitcnt vmcnt(0)
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_wait:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0)
   %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0)
@@ -49,29 +78,48 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %offset) {
+; VERDE-LABEL: buffer_store_x1:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x1:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %offset) #0 {
+; VERDE-LABEL: buffer_store_x2:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x2:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_and:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
 define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; VERDE-LABEL: buffer_store_x1_offen_merged_and:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; VERDE-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x1_offen_merged_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -87,11 +135,20 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg %
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_or:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28
 define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; VERDE-LABEL: buffer_store_x1_offen_merged_or:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
+; VERDE-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; VERDE-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x1_offen_merged_or:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
+; CHECK-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_endpgm
   %a = shl i32 %inp, 6
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -109,12 +166,20 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %r
 }
 
 
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
 define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; VERDE-LABEL: buffer_store_x1_offen_merged_glc_slc:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
+; VERDE-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
+; VERDE-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x1_offen_merged_glc_slc:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
+; CHECK-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
+; CHECK-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -130,10 +195,16 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inr
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
+; VERDE-LABEL: buffer_store_x2_offen_merged_and:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x2_offen_merged_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0)
@@ -141,10 +212,18 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg %
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
 define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) {
+; VERDE-LABEL: buffer_store_x2_offen_merged_or:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; VERDE-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x2_offen_merged_or:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_endpgm
   %a = shl i32 %inp, 4
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
@@ -153,11 +232,18 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %r
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
 define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; VERDE-LABEL: buffer_store_x1_offset_merged:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; VERDE-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x1_offset_merged:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; CHECK-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; CHECK-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0)
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0)
@@ -167,21 +253,35 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsr
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 define amdgpu_ps void @buffer_store_x2_offset_merged(ptr addrspace(8) inreg %rsrc, <2 x float> %v1,<2 x float> %v2) {
+; VERDE-LABEL: buffer_store_x2_offset_merged:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_x2_offset_merged:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; CHECK-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_int:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc
-;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc
 define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) {
+; VERDE-LABEL: buffer_store_int:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VERDE-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc
+; VERDE-NEXT:    buffer_store_dword v6, off, s[0:3], 0 slc
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_int:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CHECK-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc
+; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], 0 slc
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1)
@@ -189,12 +289,18 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_ptr_buffer_store_byte:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
-;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @raw_ptr_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1) {
+; VERDE-LABEL: raw_ptr_buffer_store_byte:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; VERDE-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: raw_ptr_buffer_store_byte:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; CHECK-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i8
@@ -202,12 +308,18 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_ptr_buffer_store_short:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
-;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @raw_ptr_buffer_store_short(ptr addrspace(8) inreg %rsrc, float %v1) {
+; VERDE-LABEL: raw_ptr_buffer_store_short:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; VERDE-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: raw_ptr_buffer_store_short:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CHECK-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i16
@@ -215,12 +327,16 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_ptr_buffer_store_f16:
-;CHECK-NEXT: %bb.
-;CHECK-NOT: v0
-;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @raw_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, i32 %v1) {
+; VERDE-LABEL: raw_ptr_buffer_store_f16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: raw_ptr_buffer_store_f16:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CHECK-NEXT:    s_endpgm
 main_body:
   %trunc = trunc i32 %v1 to i16
   %cast = bitcast i16 %trunc to half
@@ -228,74 +344,169 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v2f16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %offset) {
+; VERDE-LABEL: buffer_store_v2f16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
+; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_v2f16:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v4f16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %data, i32 %offset) #0 {
+; VERDE-LABEL: buffer_store_v4f16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v5, v1
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; VERDE-NEXT:    v_or_b32_e32 v1, v2, v1
+; VERDE-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v2
+; VERDE-NEXT:    buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_v4f16:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v8f16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) #0 {
+; VERDE-LABEL: buffer_store_v8f16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v9, v5
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VERDE-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; VERDE-NEXT:    v_or_b32_e32 v5, v6, v5
+; VERDE-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
+; VERDE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_or_b32_e32 v4, v4, v6
+; VERDE-NEXT:    v_or_b32_e32 v3, v2, v3
+; VERDE-NEXT:    v_or_b32_e32 v2, v0, v1
+; VERDE-NEXT:    buffer_store_dwordx4 v[2:5], v8, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_v8f16:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v2bf16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) {
+; VERDE-LABEL: buffer_store_v2bf16:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; VERDE-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; VERDE-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_v2bf16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v4bf16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 {
+; VERDE-LABEL: buffer_store_v4bf16:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; VERDE-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; VERDE-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; VERDE-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; VERDE-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; VERDE-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; VERDE-NEXT:    v_alignbit_b32 v1, v1, v0, 16
+; VERDE-NEXT:    buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_v4bf16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_ptr_buffer_store_i16:
-;CHECK-NEXT: %bb.
-;CHECK-NOT: v0
-;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @raw_ptr_buffer_store_i16(ptr addrspace(8) inreg %rsrc, i32 %v1) {
+; VERDE-LABEL: raw_ptr_buffer_store_i16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: raw_ptr_buffer_store_i16:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CHECK-NEXT:    s_endpgm
 main_body:
   %trunc = trunc i32 %v1 to i16
   call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %trunc, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v2i16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v2i16(ptr addrspace(8) inreg %rsrc, <2 x i16> %data, i32 %offset) {
+; VERDE-LABEL: buffer_store_v2i16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
+; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_v2i16:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v4i16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %data, i32 %offset) #0 {
+; VERDE-LABEL: buffer_store_v4i16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VERDE-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VERDE-NEXT:    v_or_b32_e32 v2, v2, v3
+; VERDE-NEXT:    v_or_b32_e32 v1, v0, v1
+; VERDE-NEXT:    buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_v4i16:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -307,21 +518,45 @@ main_body:
 ;   call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
 ;   ret void
 ; }
-
-;CHECK-LABEL: {{^}}buffer_store_v8i16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v8i16(ptr addrspace(8) inreg %rsrc, <8 x i16> %data, i32 %offset) #0 {
+; VERDE-LABEL: buffer_store_v8i16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; VERDE-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; VERDE-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; VERDE-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; VERDE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VERDE-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VERDE-NEXT:    v_or_b32_e32 v6, v6, v7
+; VERDE-NEXT:    v_or_b32_e32 v5, v4, v5
+; VERDE-NEXT:    v_or_b32_e32 v4, v2, v3
+; VERDE-NEXT:    v_or_b32_e32 v3, v0, v1
+; VERDE-NEXT:    buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: buffer_store_v8i16:
+; CHECK:       ; %bb.0: ; %main_body
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; CHECK-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
 define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; VERDE-LABEL: raw_ptr_buffer_store_x1_offset_merged:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; VERDE-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: raw_ptr_buffer_store_x1_offset_merged:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; CHECK-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; CHECK-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0)
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0)
@@ -331,14 +566,26 @@ define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_merged(ptr addrspace(8) in
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_swizzled_not_merged:
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32
 define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_swizzled_not_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; VERDE-LABEL: raw_ptr_buffer_store_x1_offset_swizzled_not_merged:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; VERDE-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:8
+; VERDE-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:12
+; VERDE-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:16
+; VERDE-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:28
+; VERDE-NEXT:    buffer_store_dword v5, off, s[0:3], 0 offset:32
+; VERDE-NEXT:    s_endpgm
+;
+; CHECK-LABEL: raw_ptr_buffer_store_x1_offset_swizzled_not_merged:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:8
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:12
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:16
+; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:28
+; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], 0 offset:32
+; CHECK-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 8)
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 8)
   call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 8)
diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
index c7987d3d009175..02641f5b6ae8c1 100644
--- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -1,40 +1,118 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}s_mulk_i32_k0:
-; SI: s_load_dword [[VAL:s[0-9]+]]
-; SI: s_mulk_i32 [[VAL]], 0x41
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
-; SI: buffer_store_dword [[VRESULT]]
-; SI: s_endpgm
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+
 define amdgpu_kernel void @s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) {
+; GFX6-LABEL: s_mulk_i32_k0:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x2
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mulk_i32 s4, 0x41
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: s_mulk_i32_k0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x8
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mulk_i32 s4, 0x41
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
   %mul = mul i32 %b, 65
   store i32 %mul, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_mulk_i32_k1:
-; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}}
-; SI: s_endpgm
 define amdgpu_kernel void @s_mulk_i32_k1(ptr addrspace(1) %out, i32 %b) {
+; GFX6-LABEL: s_mulk_i32_k1:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x2
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mulk_i32 s4, 0x7fff
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: s_mulk_i32_k1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x8
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mulk_i32 s4, 0x7fff
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
   %mul = mul i32 %b, 32767 ; (1 << 15) - 1
   store i32 %mul, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}s_mulk_i32_k2:
-; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}}
-; SI: s_endpgm
 define amdgpu_kernel void @s_mulk_i32_k2(ptr addrspace(1) %out, i32 %b) {
+; GFX6-LABEL: s_mulk_i32_k2:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x2
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mulk_i32 s4, 0xffef
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: s_mulk_i32_k2:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x8
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mulk_i32 s4, 0xffef
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
   %mul = mul i32 %b, -17
   store i32 %mul, ptr addrspace(1) %out
   ret void
 }
 
-; SI-LABEL: {{^}}no_s_mulk_i32_k0:
-; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}
-; SI: s_endpgm
 define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) {
+; GFX6-LABEL: no_s_mulk_i32_k0:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dword s4, s[2:3], 0x2
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mul_i32 s4, s4, 0x8001
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: no_s_mulk_i32_k0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x8
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mul_i32 s4, s4, 0x8001
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
   %mul = mul i32 %b, 32769 ; 1 << 15 + 1
   store i32 %mul, ptr addrspace(1) %out
   ret void
@@ -42,9 +120,28 @@ define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) {
 
 @lds = addrspace(3) global [512 x i32] undef, align 4
 
-; SI-LABEL: {{^}}commute_s_mulk_i32:
-; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
 define amdgpu_kernel void @commute_s_mulk_i32(ptr addrspace(1) %out, i32 %b) #0 {
+; GFX6-LABEL: commute_s_mulk_i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dword s0, s[2:3], 0x2
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mulk_i32 s0, 0x800
+; GFX6-NEXT:    ;;#ASMSTART
+; GFX6-NEXT:    ; foo v0, s0
+; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: commute_s_mulk_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x8
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mulk_i32 s0, 0x800
+; GFX8-NEXT:    ;;#ASMSTART
+; GFX8-NEXT:    ; foo v0, s0
+; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_endpgm
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %add = mul i32 %size, %b
   call void asm sideeffect "; foo $0, $1", "v,s"(ptr addrspace(3) @lds, i32 %add)



More information about the llvm-commits mailing list