[llvm] b8c8d1b - AMDGPU: Convert some tests to use new buffer intrinsics
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 30 07:30:49 PDT 2020
Author: Matt Arsenault
Date: 2020-07-30T10:30:43-04:00
New Revision: b8c8d1b30986a25ef392c786daf178beff230f6d
URL: https://github.com/llvm/llvm-project/commit/b8c8d1b30986a25ef392c786daf178beff230f6d
DIFF: https://github.com/llvm/llvm-project/commit/b8c8d1b30986a25ef392c786daf178beff230f6d.diff
LOG: AMDGPU: Convert some tests to use new buffer intrinsics
The legacy not struct or raw buffer intrinsics should now all be
consolidated into the tests specifically for those intrinsics.
Added:
Modified:
llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
llvm/test/CodeGen/AMDGPU/mubuf.ll
llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index fde3ab8c6d4a..be819ceb88cc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -23,8 +23,6 @@ define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %res
ret void
}
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #1
; Function Attrs: nounwind readnone speculatable
diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
index 3375efa14dd9..f4c8f67bbd63 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
@@ -1,11 +1,5 @@
; RUN: llc -mtriple=amdgcn--amdpal -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
-declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
-declare i32 @llvm.amdgcn.wwm.i32(i32) #1
-declare void @llvm.amdgcn.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #2
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #2
-
define amdgpu_hs void @foo(i32 inreg %arg, <4 x i32> inreg %buffer) {
entry:
br label %work
@@ -19,7 +13,7 @@ bb602:
br i1 %tmp607, label %bb49, label %bb54
bb49:
- tail call void @llvm.amdgcn.tbuffer.store.f32(float 1.000000e+00, <4 x i32> %buffer, i32 0, i32 1, i32 1, i32 4, i32 4, i32 7, i1 true, i1 false) #7
+ call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, <4 x i32> %buffer, i32 4, i32 1, i32 116, i32 1)
ret void
bb54:
@@ -42,6 +36,10 @@ work:
br i1 %tmp34, label %bb602, label %bb42
}
-attributes #0 = { convergent nounwind readnone }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { nounwind writeonly }
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
+declare i32 @llvm.amdgcn.wwm.i32(i32) #1
+declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2
+
+attributes #0 = { convergent nounwind readnone willreturn }
+attributes #1 = { convergent nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind willreturn writeonly }
diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
index fea47db60d9e..73f4f9e0cfc0 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -27,11 +27,10 @@ main_body:
%tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1
%tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2
%tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 0, i32 %arg, i32 0, i32 14, i32 4, i1 1, i1 1)
+ call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 78, i32 3) #2
ret void
}
-; Function Attrs: nounwind
-declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #0
-attributes #0 = { nounwind }
+attributes #0 = { nounwind willreturn writeonly }
diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
index 958692e0c92b..e363d039548b 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
@@ -11,13 +11,14 @@ define amdgpu_vs void @test1(i32 %v) #0 {
store i32 %v, i32 addrspace(3)* %p0
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 68, i32 1)
%w = load i32, i32 addrspace(3)* %p0
store i32 %w, i32 addrspace(3)* %p1
ret void
}
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #1
attributes #0 = { nounwind }
+attributes #1 = { nounwind willreturn writeonly }
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll
index b28e8fa72ec7..90da8406c4b7 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,7 +1,5 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
-declare i32 @llvm.amdgcn.workitem.id.x() readnone
-
;;;==========================================================================;;;
;;; MUBUF LOAD TESTS
;;;==========================================================================;;;
@@ -60,10 +58,10 @@ main_body:
%tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
%tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
%tmp2 = shl i32 %6, 2
- %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1)
+ %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1)
%tmp4 = add i32 %6, 16
%tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3)
ret void
}
@@ -79,10 +77,10 @@ main_body:
%tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
%tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
%tmp2 = shl i32 %6, 2
- %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1)
+ %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1)
%tmp4 = add i32 %6, 16
%tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 68, i32 3)
ret void
}
@@ -136,14 +134,14 @@ entry:
; CHECK-LABEL: {{^}}store_sgpr_ptr:
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
-define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) {
store i32 99, i32 addrspace(1)* %out, align 4
ret void
}
; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
-define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) {
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
store i32 99, i32 addrspace(1)* %out.gep, align 4
ret void
@@ -152,7 +150,7 @@ define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) {
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
store i32 99, i32 addrspace(1)* %out.gep, align 4
ret void
@@ -161,7 +159,7 @@ define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #
; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) {
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
ret void
@@ -169,14 +167,20 @@ define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)*
; CHECK-LABEL: {{^}}store_vgpr_ptr:
; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
store i32 99, i32 addrspace(1)* %out.gep, align 4
ret void
}
-declare i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32>, i32, i32, i32) #0
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #3
attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind willreturn writeonly }
+attributes #3 = { nounwind readonly willreturn }
+attributes #4 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
index be76371c42df..8aa48d4d24ca 100644
--- a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
@@ -25,31 +25,31 @@ main_body:
%array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2
%array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3
%tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0)
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 36, i32 %arg, i32 68, i32 3)
%bc = bitcast <4 x float> %array_vector3 to <4 x i32>
%tmp4 = extractelement <4 x i32> %bc, i32 undef
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 48, i32 4, i32 4, i1 1, i1 1)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 48, i32 %arg, i32 68, i32 3)
%bc49 = bitcast <4 x float> %array_vector11 to <4 x i32>
%tmp5 = extractelement <4 x i32> %bc49, i32 undef
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 72, i32 4, i32 4, i1 1, i1 1)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 72, i32 %arg, i32 68, i32 3)
%array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1
%array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2
%array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3
- call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 28, i32 4, i32 4, i1 1, i1 1)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 28, i32 %arg, i32 68, i32 3)
%bc52 = bitcast <4 x float> %array_vector23 to <4 x i32>
%tmp6 = extractelement <4 x i32> %bc52, i32 undef
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 64, i32 4, i32 4, i1 1, i1 1)
- call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 20, i32 4, i32 4, i1 1, i1 1)
- call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 56, i32 4, i32 4, i1 1, i1 1)
- call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 92, i32 4, i32 4, i1 1, i1 1)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 64, i32 %arg, i32 68, i32 3)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 20, i32 %arg, i32 68, i32 3)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 56, i32 %arg, i32 68, i32 3)
+ call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 92, i32 %arg, i32 68, i32 3)
ret void
}
-declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
-declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #1
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #2
+declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #3
attributes #0 = { nounwind "target-cpu"="tonga" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readonly }
-attributes #3 = { nounwind }
+attributes #1 = { nounwind readnone willreturn }
+attributes #2 = { nounwind readonly willreturn }
+attributes #3 = { nounwind willreturn writeonly }
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index ec2dbb995c98..225d7cc0d9a2 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,12 +1,6 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.s.barrier() #1
-declare i32 @llvm.amdgcn.workitem.id.x() #2
-
-
@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
@stored_constant_ptr = addrspace(3) global i32 addrspace(4)* undef, align 8
@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
@@ -296,30 +290,33 @@ define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(
ret void
}
-; XGCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load:
-; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
-; XCI: TBUFFER_STORE_FORMAT
-; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
-; define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 {
-; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+; GCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load:
+; GCN: tbuffer_store_format
+; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:2
+define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 {
+ %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
-; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-; %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
-; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
+ %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
-; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-; call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef,
-; i32 %vaddr, i32 0, i32 0, i32 32, i32 14, i32 4, i1 1, i1 1)
+ %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+ %vaddr.add = add i32 %vaddr, 32
+ call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, i32 %vaddr.add, i32 0, i32 0, i32 228, i32 3)
-; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+ %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
-; %add = add nsw i32 %tmp1, %tmp2
+ %add = add nsw i32 %tmp1, %tmp2
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
-; store i32 %add, i32 addrspace(1)* %out, align 4
-; ret void
-; }
+declare void @llvm.amdgcn.s.barrier() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) #3
attributes #0 = { nounwind }
-attributes #1 = { nounwind convergent }
-attributes #2 = { nounwind readnone }
+attributes #1 = { convergent nounwind willreturn }
+attributes #2 = { nounwind readnone speculatable willreturn }
+attributes #3 = { nounwind willreturn writeonly }
More information about the llvm-commits
mailing list