[llvm] b8c8d1b - AMDGPU: Convert some tests to use new buffer intrinsics

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 30 07:30:49 PDT 2020


Author: Matt Arsenault
Date: 2020-07-30T10:30:43-04:00
New Revision: b8c8d1b30986a25ef392c786daf178beff230f6d

URL: https://github.com/llvm/llvm-project/commit/b8c8d1b30986a25ef392c786daf178beff230f6d
DIFF: https://github.com/llvm/llvm-project/commit/b8c8d1b30986a25ef392c786daf178beff230f6d.diff

LOG: AMDGPU: Convert some tests to use new buffer intrinsics

The legacy buffer intrinsics (those that are neither the struct nor the raw
variants) should now all be consolidated into the tests specifically for
those intrinsics.

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
    llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
    llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
    llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
    llvm/test/CodeGen/AMDGPU/mubuf.ll
    llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
    llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index fde3ab8c6d4a..be819ceb88cc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -23,8 +23,6 @@ define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %res
   ret void
 }
 
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-; Function Attrs: nounwind writeonly
 declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #1
 
 ; Function Attrs: nounwind readnone speculatable

diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
index 3375efa14dd9..f4c8f67bbd63 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
@@ -1,11 +1,5 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
-declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
-declare i32 @llvm.amdgcn.wwm.i32(i32) #1
-declare void @llvm.amdgcn.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #2
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #2
-
 define amdgpu_hs void @foo(i32 inreg %arg, <4 x i32> inreg %buffer) {
 entry:
   br label %work
@@ -19,7 +13,7 @@ bb602:
   br i1 %tmp607, label %bb49, label %bb54
 
 bb49:
-  tail call void @llvm.amdgcn.tbuffer.store.f32(float 1.000000e+00, <4 x i32> %buffer, i32 0, i32 1, i32 1, i32 4, i32 4, i32 7, i1 true, i1 false) #7
+  call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, <4 x i32> %buffer, i32 4, i32 1, i32 116, i32 1)
   ret void
 
 bb54:
@@ -42,6 +36,10 @@ work:
   br i1 %tmp34, label %bb602, label %bb42
 }
 
-attributes #0 = { convergent nounwind readnone }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { nounwind writeonly }
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
+declare i32 @llvm.amdgcn.wwm.i32(i32) #1
+declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2
+
+attributes #0 = { convergent nounwind readnone willreturn }
+attributes #1 = { convergent nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind willreturn writeonly }

diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
index fea47db60d9e..73f4f9e0cfc0 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -27,11 +27,10 @@ main_body:
   %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1
   %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2
   %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3
-  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 0, i32 %arg, i32 0, i32 14, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 78, i32 3) #2
   ret void
 }
 
-; Function Attrs: nounwind
-declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind willreturn writeonly }

diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
index 958692e0c92b..e363d039548b 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
@@ -11,13 +11,14 @@ define amdgpu_vs void @test1(i32 %v) #0 {
 
   store i32 %v, i32 addrspace(3)* %p0
 
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 68, i32 1)
 
   %w = load i32, i32 addrspace(3)* %p0
   store i32 %w, i32 addrspace(3)* %p1
   ret void
 }
 
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #1
 
 attributes #0 = { nounwind }
+attributes #1 = { nounwind willreturn writeonly }

diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll
index b28e8fa72ec7..90da8406c4b7 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,7 +1,5 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
 
-declare i32 @llvm.amdgcn.workitem.id.x() readnone
-
 ;;;==========================================================================;;;
 ;;; MUBUF LOAD TESTS
 ;;;==========================================================================;;;
@@ -60,10 +58,10 @@ main_body:
   %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
   %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
   %tmp2 = shl i32 %6, 2
-  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1)
+  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1)
   %tmp4 = add i32 %6, 16
   %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32>
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3)
   ret void
 }
 
@@ -79,10 +77,10 @@ main_body:
   %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
   %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
   %tmp2 = shl i32 %6, 2
-  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1)
+  %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1)
   %tmp4 = add i32 %6, 16
   %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32>
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3)
   ret void
 }
 
@@ -136,14 +134,14 @@ entry:
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
-define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) {
   store i32 99, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
-define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) {
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
@@ -152,7 +150,7 @@ define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
 ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) {
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
@@ -161,7 +159,7 @@ define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
 ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) {
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
   ret void
@@ -169,14 +167,20 @@ define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)*
 
 ; CHECK-LABEL: {{^}}store_vgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
 
-declare i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32>, i32, i32, i32) #0
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #3
 
 attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind willreturn writeonly }
+attributes #3 = { nounwind readonly willreturn }
+attributes #4 = { readnone }

diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
index be76371c42df..8aa48d4d24ca 100644
--- a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
@@ -25,31 +25,31 @@ main_body:
   %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2
   %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3
   %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0)
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 36, i32 %arg, i32 68, i32 3)
   %bc = bitcast <4 x float> %array_vector3 to <4 x i32>
   %tmp4 = extractelement <4 x i32> %bc, i32 undef
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 48, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 48, i32 %arg, i32 68, i32 3)
   %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32>
   %tmp5 = extractelement <4 x i32> %bc49, i32 undef
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 72, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 72, i32 %arg, i32 68, i32 3)
   %array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1
   %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2
   %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 28, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 28, i32 %arg, i32 68, i32 3)
   %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32>
   %tmp6 = extractelement <4 x i32> %bc52, i32 undef
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 64, i32 4, i32 4, i1 1, i1 1)
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 20, i32 4, i32 4, i1 1, i1 1)
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 56, i32 4, i32 4, i1 1, i1 1)
-  call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 92, i32 4, i32 4, i1 1, i1 1)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 64, i32 %arg, i32 68, i32 3)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 20, i32 %arg, i32 68, i32 3)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 56, i32 %arg, i32 68, i32 3)
+  call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 92, i32 %arg, i32 68, i32 3)
   ret void
 }
 
-declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
-declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #1
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #2
+declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #3
 
 attributes #0 = { nounwind "target-cpu"="tonga" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readonly }
-attributes #3 = { nounwind }
+attributes #1 = { nounwind readnone willreturn }
+attributes #2 = { nounwind readonly willreturn }
+attributes #3 = { nounwind willreturn writeonly }

diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index ec2dbb995c98..225d7cc0d9a2 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,12 +1,6 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.s.barrier() #1
-declare i32 @llvm.amdgcn.workitem.id.x() #2
-
-
 @stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
 @stored_constant_ptr = addrspace(3) global i32 addrspace(4)* undef, align 8
 @stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
@@ -296,30 +290,33 @@ define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(
   ret void
 }
 
-; XGCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load:
-; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
-; XCI: TBUFFER_STORE_FORMAT
-; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
-; define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 {
-;   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+; GCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load:
+; GCN: tbuffer_store_format
+; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:2
+define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 {
+  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
-;   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-;   %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
 
-;   %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
 
-;   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-;   call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef,
-;         i32 %vaddr, i32 0, i32 0, i32 32, i32 14, i32 4, i1 1, i1 1)
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  %vaddr.add = add i32 %vaddr, 32
+  call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, i32 %vaddr.add, i32 0, i32 0, i32 228, i32 3)
 
-;   %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
 
-;   %add = add nsw i32 %tmp1, %tmp2
+  %add = add nsw i32 %tmp1, %tmp2
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
 
-;   store i32 %add, i32 addrspace(1)* %out, align 4
-;   ret void
-; }
+declare void @llvm.amdgcn.s.barrier() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #3
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind convergent }
-attributes #2 = { nounwind readnone }
+attributes #1 = { convergent nounwind willreturn }
+attributes #2 = { nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind willreturn writeonly }


        


More information about the llvm-commits mailing list