[llvm] r351494 - AMDGPU: Convert tests away from llvm.SI.load.const
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 17 14:47:26 PST 2019
Author: arsenm
Date: Thu Jan 17 14:47:26 2019
New Revision: 351494
URL: http://llvm.org/viewvc/llvm-project?rev=351494&view=rev
Log:
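AMDGPU: Convert tests away from llvm.SI.load.const
Each call and declaration of llvm.SI.load.const.v4i32 in the tests listed below is replaced with llvm.amdgcn.s.buffer.load.f32, keeping the same <4 x i32> descriptor and i32 offset operands and adding a trailing i32 0 operand.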
Modified:
llvm/trunk/test/CodeGen/AMDGPU/bug-vopc-commute.ll
llvm/trunk/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll
llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll
llvm/trunk/test/CodeGen/AMDGPU/si-spill-cf.ll
llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll
llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
Modified: llvm/trunk/test/CodeGen/AMDGPU/bug-vopc-commute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/bug-vopc-commute.ll?rev=351494&r1=351493&r2=351494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bug-vopc-commute.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/bug-vopc-commute.ll Thu Jan 17 14:47:26 2019
@@ -8,8 +8,8 @@
; of which were in SGPRs.
define amdgpu_vs float @main(i32 %v) {
main_body:
- %d1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 960)
- %d2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 976)
+ %d1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 960, i32 0)
+ %d2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 976, i32 0)
br i1 undef, label %ENDIF56, label %IF57
IF57: ; preds = %ENDIF
@@ -41,7 +41,7 @@ ENDIF62:
}
; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #0
attributes #0 = { nounwind readnone }
attributes #1 = { readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll?rev=351494&r1=351493&r2=351494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll Thu Jan 17 14:47:26 2019
@@ -15,9 +15,9 @@ target triple = "amdgcn--"
define amdgpu_gs void @main(i32 inreg %arg) #0 {
main_body:
- %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 20)
- %tmp1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 24)
- %tmp2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 48)
+ %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 20, i32 0)
+ %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 24, i32 0)
+ %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 48, i32 0)
%array_vector3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 3
%array_vector5 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1
%array_vector6 = insertelement <4 x float> %array_vector5, float undef, i32 2
@@ -45,7 +45,7 @@ main_body:
ret void
}
-declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2
declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3
Modified: llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll?rev=351494&r1=351493&r2=351494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll Thu Jan 17 14:47:26 2019
@@ -8,9 +8,9 @@ define amdgpu_ps void @phi1(<4 x i32> ad
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0)
- %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16)
- %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 0)
+ %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0)
+ %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 32, i32 0)
%tmp24 = fptosi float %tmp22 to i32
%tmp25 = icmp ne i32 %tmp24, 0
br i1 %tmp25, label %ENDIF, label %ELSE
@@ -32,21 +32,21 @@ define amdgpu_ps void @phi2(<4 x i32> ad
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16)
- %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32)
- %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 36)
- %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 40)
- %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 48)
- %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 52)
- %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 56)
- %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 64)
- %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 68)
- %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 72)
- %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 76)
- %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 80)
- %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 84)
- %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 88)
- %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 92)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0)
+ %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 32, i32 0)
+ %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 36, i32 0)
+ %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 40, i32 0)
+ %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 48, i32 0)
+ %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 52, i32 0)
+ %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 56, i32 0)
+ %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 64, i32 0)
+ %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 68, i32 0)
+ %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 72, i32 0)
+ %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 76, i32 0)
+ %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 80, i32 0)
+ %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 84, i32 0)
+ %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 88, i32 0)
+ %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 92, i32 0)
%tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %arg2, i32 0
%tmp37 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp36, !tbaa !0
%tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg1, i32 0
@@ -172,10 +172,10 @@ define amdgpu_ps void @loop(<4 x i32> ad
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0)
- %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 4)
- %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 8)
- %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 12)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 0)
+ %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 4, i32 0)
+ %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 8, i32 0)
+ %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 12, i32 0)
%tmp25 = fptosi float %tmp24 to i32
%tmp26 = bitcast i32 %tmp25 to float
%tmp27 = bitcast float %tmp26 to i32
@@ -225,7 +225,7 @@ define amdgpu_ps void @sample_v3([17 x <
entry:
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
%tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
- %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 16)
+ %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 16, i32 0)
%tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
%tmp24 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp23, !tbaa !0
%tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
@@ -325,7 +325,7 @@ define amdgpu_ps void @sample_rsrc([6 x
bb:
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i32 0, i32 0
%tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !3
- %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp22, i32 16)
+ %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp22, i32 16, i32 0)
%tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(4)* %arg3, i32 0, i32 0
%tmp26 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp25, !tbaa !3
%tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg2, i32 0, i32 0
@@ -409,7 +409,7 @@ declare void @llvm.amdgcn.exp.f32(i32, i
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2
-declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll?rev=351494&r1=351493&r2=351494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll Thu Jan 17 14:47:26 2019
@@ -28,44 +28,44 @@ define amdgpu_ps void @main([17 x <4 x i
main_body:
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
%tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
- %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 96)
- %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 100)
- %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 104)
- %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 112)
- %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 116)
- %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 120)
- %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 128)
- %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 132)
- %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 140)
- %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 144)
- %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 160)
- %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 176)
- %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 180)
- %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 184)
- %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 192)
- %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 196)
- %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 200)
- %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 208)
- %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 212)
- %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 216)
- %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 224)
- %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 240)
- %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 244)
- %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 248)
- %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 256)
- %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 272)
- %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 276)
- %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 280)
- %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 288)
- %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 292)
- %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 296)
- %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 304)
- %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 308)
- %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 312)
- %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 368)
- %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 372)
- %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 376)
- %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 384)
+ %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
+ %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 100, i32 0)
+ %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 104, i32 0)
+ %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 112, i32 0)
+ %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 116, i32 0)
+ %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 120, i32 0)
+ %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 128, i32 0)
+ %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 132, i32 0)
+ %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 140, i32 0)
+ %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 144, i32 0)
+ %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 160, i32 0)
+ %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 176, i32 0)
+ %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 180, i32 0)
+ %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 184, i32 0)
+ %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 192, i32 0)
+ %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 196, i32 0)
+ %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 200, i32 0)
+ %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 208, i32 0)
+ %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 212, i32 0)
+ %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 216, i32 0)
+ %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 224, i32 0)
+ %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 240, i32 0)
+ %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 244, i32 0)
+ %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 248, i32 0)
+ %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 256, i32 0)
+ %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 272, i32 0)
+ %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 276, i32 0)
+ %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 280, i32 0)
+ %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 288, i32 0)
+ %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 292, i32 0)
+ %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 296, i32 0)
+ %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 304, i32 0)
+ %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 308, i32 0)
+ %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 312, i32 0)
+ %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 368, i32 0)
+ %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 372, i32 0)
+ %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 376, i32 0)
+ %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 384, i32 0)
%tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
%tmp61 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp60, !tbaa !0
%tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
@@ -647,109 +647,109 @@ define amdgpu_ps void @main1([17 x <4 x
main_body:
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
%tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
- %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 0)
- %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 4)
- %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 8)
- %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 12)
- %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 28)
- %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 48)
- %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 52)
- %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 56)
- %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 64)
- %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 68)
- %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 72)
- %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 76)
- %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 128)
- %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 132)
- %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 144)
- %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 148)
- %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 152)
- %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 160)
- %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 164)
- %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 168)
- %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 172)
- %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 176)
- %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 180)
- %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 184)
- %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 192)
- %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 196)
- %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 200)
- %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 208)
- %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 212)
- %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 216)
- %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 220)
- %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 236)
- %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 240)
- %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 244)
- %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 248)
- %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 252)
- %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 256)
- %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 260)
- %tmp60 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 264)
- %tmp61 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 268)
- %tmp62 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 272)
- %tmp63 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 276)
- %tmp64 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 280)
- %tmp65 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 284)
- %tmp66 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 288)
- %tmp67 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 292)
- %tmp68 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 464)
- %tmp69 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 468)
- %tmp70 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 472)
- %tmp71 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 496)
- %tmp72 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 500)
- %tmp73 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 504)
- %tmp74 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 512)
- %tmp75 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 516)
- %tmp76 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 524)
- %tmp77 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 532)
- %tmp78 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 536)
- %tmp79 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 540)
- %tmp80 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 544)
- %tmp81 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 548)
- %tmp82 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 552)
- %tmp83 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 556)
- %tmp84 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 560)
- %tmp85 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 564)
- %tmp86 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 568)
- %tmp87 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 572)
- %tmp88 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 576)
- %tmp89 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 580)
- %tmp90 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 584)
- %tmp91 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 588)
- %tmp92 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 592)
- %tmp93 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 596)
- %tmp94 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 600)
- %tmp95 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 604)
- %tmp96 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 608)
- %tmp97 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 612)
- %tmp98 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 616)
- %tmp99 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 624)
- %tmp100 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 628)
- %tmp101 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 632)
- %tmp102 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 636)
- %tmp103 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 640)
- %tmp104 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 644)
- %tmp105 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 648)
- %tmp106 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 652)
- %tmp107 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 656)
- %tmp108 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 660)
- %tmp109 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 664)
- %tmp110 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 668)
- %tmp111 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 672)
- %tmp112 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 676)
- %tmp113 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 680)
- %tmp114 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 684)
- %tmp115 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 688)
- %tmp116 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 692)
- %tmp117 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 696)
- %tmp118 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 700)
- %tmp119 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 704)
- %tmp120 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 708)
- %tmp121 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 712)
- %tmp122 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 716)
- %tmp123 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 864)
- %tmp124 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 868)
+ %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 0, i32 0)
+ %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 4, i32 0)
+ %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 8, i32 0)
+ %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 12, i32 0)
+ %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 28, i32 0)
+ %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 48, i32 0)
+ %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 52, i32 0)
+ %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 56, i32 0)
+ %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 64, i32 0)
+ %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 68, i32 0)
+ %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 72, i32 0)
+ %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 76, i32 0)
+ %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 128, i32 0)
+ %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 132, i32 0)
+ %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 144, i32 0)
+ %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 148, i32 0)
+ %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 152, i32 0)
+ %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 160, i32 0)
+ %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 164, i32 0)
+ %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 168, i32 0)
+ %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 172, i32 0)
+ %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 176, i32 0)
+ %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 180, i32 0)
+ %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 184, i32 0)
+ %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 192, i32 0)
+ %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 196, i32 0)
+ %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 200, i32 0)
+ %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 208, i32 0)
+ %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 212, i32 0)
+ %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 216, i32 0)
+ %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 220, i32 0)
+ %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 236, i32 0)
+ %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 240, i32 0)
+ %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 244, i32 0)
+ %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 248, i32 0)
+ %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 252, i32 0)
+ %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 256, i32 0)
+ %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 260, i32 0)
+ %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 264, i32 0)
+ %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 268, i32 0)
+ %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 272, i32 0)
+ %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 276, i32 0)
+ %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 280, i32 0)
+ %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 284, i32 0)
+ %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 288, i32 0)
+ %tmp67 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 292, i32 0)
+ %tmp68 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 464, i32 0)
+ %tmp69 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 468, i32 0)
+ %tmp70 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 472, i32 0)
+ %tmp71 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 496, i32 0)
+ %tmp72 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 500, i32 0)
+ %tmp73 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 504, i32 0)
+ %tmp74 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 512, i32 0)
+ %tmp75 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 516, i32 0)
+ %tmp76 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 524, i32 0)
+ %tmp77 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 532, i32 0)
+ %tmp78 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 536, i32 0)
+ %tmp79 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 540, i32 0)
+ %tmp80 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 544, i32 0)
+ %tmp81 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 548, i32 0)
+ %tmp82 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 552, i32 0)
+ %tmp83 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 556, i32 0)
+ %tmp84 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 560, i32 0)
+ %tmp85 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 564, i32 0)
+ %tmp86 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 568, i32 0)
+ %tmp87 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 572, i32 0)
+ %tmp88 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 576, i32 0)
+ %tmp89 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 580, i32 0)
+ %tmp90 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 584, i32 0)
+ %tmp91 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 588, i32 0)
+ %tmp92 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 592, i32 0)
+ %tmp93 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 596, i32 0)
+ %tmp94 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 600, i32 0)
+ %tmp95 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 604, i32 0)
+ %tmp96 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 608, i32 0)
+ %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 612, i32 0)
+ %tmp98 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 616, i32 0)
+ %tmp99 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 624, i32 0)
+ %tmp100 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 628, i32 0)
+ %tmp101 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 632, i32 0)
+ %tmp102 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 636, i32 0)
+ %tmp103 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 640, i32 0)
+ %tmp104 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 644, i32 0)
+ %tmp105 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 648, i32 0)
+ %tmp106 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 652, i32 0)
+ %tmp107 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 656, i32 0)
+ %tmp108 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 660, i32 0)
+ %tmp109 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 664, i32 0)
+ %tmp110 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 668, i32 0)
+ %tmp111 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 672, i32 0)
+ %tmp112 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 676, i32 0)
+ %tmp113 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 680, i32 0)
+ %tmp114 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 684, i32 0)
+ %tmp115 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 688, i32 0)
+ %tmp116 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 692, i32 0)
+ %tmp117 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 696, i32 0)
+ %tmp118 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 700, i32 0)
+ %tmp119 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 704, i32 0)
+ %tmp120 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 708, i32 0)
+ %tmp121 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 712, i32 0)
+ %tmp122 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 716, i32 0)
+ %tmp123 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 864, i32 0)
+ %tmp124 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 868, i32 0)
%tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0
%tmp126 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp125, !tbaa !0
%tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0
@@ -1683,7 +1683,7 @@ declare <4 x float> @llvm.amdgcn.image.s
declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2
declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2
declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2
-declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/si-spill-cf.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-spill-cf.ll?rev=351494&r1=351493&r2=351494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-spill-cf.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-spill-cf.ll Thu Jan 17 14:47:26 2019
@@ -9,73 +9,73 @@
define amdgpu_ps void @main() #0 {
main_body:
- %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 16)
- %tmp1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 32)
- %tmp2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 80)
- %tmp3 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 84)
- %tmp4 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 88)
- %tmp5 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96)
- %tmp6 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 100)
- %tmp7 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 104)
- %tmp8 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 112)
- %tmp9 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 116)
- %tmp10 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 120)
- %tmp11 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 128)
- %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 132)
- %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 136)
- %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 144)
- %tmp15 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 148)
- %tmp16 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 152)
- %tmp17 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 160)
- %tmp18 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 164)
- %tmp19 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 168)
- %tmp20 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 176)
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 180)
- %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 184)
- %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 192)
- %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 196)
- %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 200)
- %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 208)
- %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 212)
- %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 216)
- %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 224)
- %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 228)
- %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 232)
- %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 240)
- %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 244)
- %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 248)
- %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 256)
- %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 260)
- %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 264)
- %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 272)
- %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 276)
- %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 280)
- %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 288)
- %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 292)
- %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 296)
- %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 304)
- %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 308)
- %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 312)
- %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 320)
- %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 324)
- %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 328)
- %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 336)
- %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 340)
- %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 344)
- %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 352)
- %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 356)
- %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 360)
- %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 368)
- %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 372)
- %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 376)
- %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 384)
- %tmp60 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 388)
- %tmp61 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 392)
- %tmp62 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 400)
- %tmp63 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 404)
- %tmp64 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 408)
- %tmp65 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 416)
- %tmp66 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 420)
+ %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 16, i32 0)
+ %tmp1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 32, i32 0)
+ %tmp2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 80, i32 0)
+ %tmp3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 84, i32 0)
+ %tmp4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 88, i32 0)
+ %tmp5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 96, i32 0)
+ %tmp6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 100, i32 0)
+ %tmp7 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 104, i32 0)
+ %tmp8 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 112, i32 0)
+ %tmp9 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 116, i32 0)
+ %tmp10 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 120, i32 0)
+ %tmp11 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 128, i32 0)
+ %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 132, i32 0)
+ %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 136, i32 0)
+ %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 144, i32 0)
+ %tmp15 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 148, i32 0)
+ %tmp16 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 152, i32 0)
+ %tmp17 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 160, i32 0)
+ %tmp18 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 164, i32 0)
+ %tmp19 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 168, i32 0)
+ %tmp20 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 176, i32 0)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 180, i32 0)
+ %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 184, i32 0)
+ %tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 192, i32 0)
+ %tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 196, i32 0)
+ %tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 200, i32 0)
+ %tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 208, i32 0)
+ %tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 212, i32 0)
+ %tmp28 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 216, i32 0)
+ %tmp29 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 224, i32 0)
+ %tmp30 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 228, i32 0)
+ %tmp31 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 232, i32 0)
+ %tmp32 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 240, i32 0)
+ %tmp33 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 244, i32 0)
+ %tmp34 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 248, i32 0)
+ %tmp35 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 256, i32 0)
+ %tmp36 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 260, i32 0)
+ %tmp37 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 264, i32 0)
+ %tmp38 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 272, i32 0)
+ %tmp39 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 276, i32 0)
+ %tmp40 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 280, i32 0)
+ %tmp41 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 288, i32 0)
+ %tmp42 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 292, i32 0)
+ %tmp43 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 296, i32 0)
+ %tmp44 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 304, i32 0)
+ %tmp45 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 308, i32 0)
+ %tmp46 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 312, i32 0)
+ %tmp47 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 320, i32 0)
+ %tmp48 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 324, i32 0)
+ %tmp49 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 328, i32 0)
+ %tmp50 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 336, i32 0)
+ %tmp51 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 340, i32 0)
+ %tmp52 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 344, i32 0)
+ %tmp53 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 352, i32 0)
+ %tmp54 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 356, i32 0)
+ %tmp55 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 360, i32 0)
+ %tmp56 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 368, i32 0)
+ %tmp57 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 372, i32 0)
+ %tmp58 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 376, i32 0)
+ %tmp59 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 384, i32 0)
+ %tmp60 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 388, i32 0)
+ %tmp61 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 392, i32 0)
+ %tmp62 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 400, i32 0)
+ %tmp63 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 404, i32 0)
+ %tmp64 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 408, i32 0)
+ %tmp65 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 416, i32 0)
+ %tmp66 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 420, i32 0)
br label %LOOP
LOOP: ; preds = %ENDIF2795, %main_body
@@ -497,7 +497,7 @@ declare float @llvm.minnum.f32(float, fl
declare float @llvm.maxnum.f32(float, float) #1
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/smrd.ll?rev=351494&r1=351493&r2=351494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/smrd.ll Thu Jan 17 14:47:26 2019
@@ -98,7 +98,7 @@ main_body:
%d1 = insertelement <4 x i32> %d0, i32 1, i32 1
%d2 = insertelement <4 x i32> %d1, i32 2, i32 2
%d3 = insertelement <4 x i32> %d2, i32 3, i32 3
- %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %d3, i32 0)
+ %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %d3, i32 0, i32 0)
ret float %r
}
@@ -110,7 +110,7 @@ define amdgpu_ps void @smrd_load_const0(
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 16, i32 0)
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
ret void
}
@@ -126,7 +126,7 @@ define amdgpu_ps void @smrd_load_const1(
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1020, i32 0)
%tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
%s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1020, i32 1)
%s.buffer.float = bitcast i32 %s.buffer to float
@@ -149,7 +149,7 @@ define amdgpu_ps void @smrd_load_const2(
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1024, i32 0)
%tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
%s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1024, i32 0)
%s.buffer.float = bitcast i32 %s.buffer to float
@@ -170,7 +170,7 @@ define amdgpu_ps void @smrd_load_const3(
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048572, i32 0)
%tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
%s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048572, i32 0)
%s.buffer.float = bitcast i32 %s.buffer to float
@@ -190,7 +190,7 @@ define amdgpu_ps void @smrd_load_const4(
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 1048576, i32 0)
%tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
%s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 1048576, i32 0)
%s.buffer.float = bitcast i32 %s.buffer to float
@@ -278,7 +278,7 @@ main_body:
; GCN: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4
define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
main_body:
- %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset)
+ %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
ret float %r
}
@@ -286,7 +286,7 @@ main_body:
; GCN: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
define amdgpu_ps float @smrd_vgpr_offset(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
- %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset)
+ %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
ret float %r
}
@@ -296,7 +296,7 @@ main_body:
define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
%off = add i32 %offset, 4092
- %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
+ %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0)
ret float %r
}
@@ -308,7 +308,7 @@ main_body:
define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
%off = add i32 %offset, 4096
- %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
+ %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %off, i32 0)
ret float %r
}
@@ -320,12 +320,12 @@ main_body:
; VIGFX9-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 {
main_body:
- %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
- %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 8)
- %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 12)
- %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 16)
- %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 28)
- %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 32)
+ %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0)
+ %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 8, i32 0)
+ %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 12, i32 0)
+ %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 16, i32 0)
+ %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 28, i32 0)
+ %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 32, i32 0)
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
ret void
@@ -352,7 +352,7 @@ main_body:
;
define amdgpu_ps float @smrd_imm_merge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
main_body:
- %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
+ %idx1.f = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 0, i32 0)
%idx1 = bitcast float %idx1.f to i32
%v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim)
@@ -377,7 +377,7 @@ main_body:
%v1 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2
%b = extractelement <3 x float> %v1, i32 %idx1
- %c = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
+ %c = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 4, i32 0)
%res.tmp = fadd float %a, %b
%res = fadd float %res.tmp, %c
@@ -396,12 +396,12 @@ main_body:
%a4 = add i32 %a, 16
%a5 = add i32 %a, 28
%a6 = add i32 %a, 32
- %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a1)
- %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a2)
- %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a3)
- %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a4)
- %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a5)
- %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %a6)
+ %r1 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a1, i32 0)
+ %r2 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a2, i32 0)
+ %r3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a3, i32 0)
+ %r4 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a4, i32 0)
+ %r5 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a5, i32 0)
+ %r6 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %a6, i32 0)
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
ret void
@@ -428,14 +428,14 @@ ret_block:
.inner_loop_body:
%descriptor = load <4 x i32>, <4 x i32> addrspace(4)* %descptr, align 16, !invariant.load !0
- %load1result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 0)
+ %load1result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 0, i32 0)
store float %load1result, float addrspace(1)* undef
%inner_br2 = icmp uge i32 %1, 10
br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body
.outer_loop_body:
%offset = shl i32 %loopctr.2, 6
- %load2result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 %offset)
+ %load2result = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %descriptor, i32 %offset, i32 0)
%outer_br = fcmp ueq float %load2result, 0x0
br i1 %outer_br, label %.outer_loop_header, label %ret_block
}
@@ -451,7 +451,7 @@ define amdgpu_ps void @smrd_load_noncons
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
%tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
%s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
%s.buffer.float = bitcast i32 %s.buffer to float
@@ -470,7 +470,7 @@ define amdgpu_ps void @smrd_load_noncons
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
%tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
%s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
%s.buffer.float = bitcast i32 %s.buffer to float
@@ -489,7 +489,7 @@ define amdgpu_ps void @smrd_load_noncons
main_body:
%tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0
%tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp
- %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 %ncoff)
+ %tmp21 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp20, i32 %ncoff, i32 0)
%tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %in
%s.buffer = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %tmp22, i32 %ncoff, i32 0)
%s.buffer.elt = extractelement <8 x i32> %s.buffer, i32 1
@@ -581,7 +581,7 @@ loop:
%counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop ]
%sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop ]
%offset = shl i32 %counter, 2
- %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset)
+ %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
%sum.next = fadd float %sum, %v
%counter.next = add i32 %counter, 1
%cc = icmp uge i32 %counter.next, %bound
@@ -607,7 +607,7 @@ loop:
%counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop.a ], [ %counter.next, %loop.b ]
%sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop.a ], [ %sum.next.b, %loop.b ]
%offset = shl i32 %counter, 2
- %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset)
+ %v = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
%sum.next = fadd float %sum, %v
%counter.next = add i32 %counter, 1
%cc = icmp uge i32 %counter.next, %bound
@@ -644,7 +644,7 @@ if1:
endif1: ; preds = %if1, %main_body
%tmp13 = extractelement <3 x i32> %arg4, i32 0
- %tmp97 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 %tmp13)
+ %tmp97 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 %tmp13, i32 0)
ret float %tmp97
}
@@ -689,10 +689,10 @@ define amdgpu_ps void @s_buffer_load_v16
}
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
-declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) #1
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)
Modified: llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll?rev=351494&r1=351493&r2=351494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll Thu Jan 17 14:47:26 2019
@@ -8,7 +8,7 @@
; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(4)* byval %arg) #0 {
bb:
- %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96)
+ %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 96, i32 0)
%tmp1 = bitcast float %tmp to i32
br i1 undef, label %bb2, label %bb3
@@ -31,7 +31,7 @@ bb3:
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2
-declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll?rev=351494&r1=351493&r2=351494&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll Thu Jan 17 14:47:26 2019
@@ -31,9 +31,9 @@ define amdgpu_vs void @main([9 x <4 x i3
bb:
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i64 0, i64 0
%tmp11 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, align 16, !tbaa !0
- %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 0)
- %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 16)
- %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 32)
+ %tmp12 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 0, i32 0)
+ %tmp13 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 16, i32 0)
+ %tmp14 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp11, i32 32, i32 0)
%tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg4, i64 0, i64 0
%tmp16 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp15, align 16, !tbaa !0
%tmp17 = add i32 %arg5, %arg7
@@ -488,7 +488,7 @@ bb157:
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2
attributes #0 = { nounwind }
More information about the llvm-commits mailing list