[llvm] r317754 - AMDGPU: Lower buffer store and atomic intrinsics manually
Michel Dänzer via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 9 07:49:52 PST 2017
Hi Marek,
On 09/11/17 02:52 AM, Marek Olsak via llvm-commits wrote:
> Author: mareko
> Date: Wed Nov 8 17:52:48 2017
> New Revision: 317754
>
> URL: http://llvm.org/viewvc/llvm-project?rev=317754&view=rev
> Log:
> AMDGPU: Lower buffer store and atomic intrinsics manually
This change broke piglit
spec@arb_tessellation_shader@execution@tes-input@tes-input-gl_clipdistance
on my Tonga:
PIGLIT TEST: 61 - tes-input-gl_ClipDistance
Probe color at (62,62)
Expected: 0.000000 1.000000 0.000000
Observed: 1.000000 0.000000 0.000000
Test failure on line 184
Probe color at (187,62)
Expected: 0.000000 1.000000 0.000000
Observed: 1.000000 0.000000 0.000000
Test failure on line 185
Probe color at (62,187)
Expected: 0.000000 1.000000 0.000000
Observed: 1.000000 0.000000 0.000000
Test failure on line 186
Probe color at (187,187)
Expected: 0.000000 1.000000 0.000000
Observed: 1.000000 0.000000 0.000000
Test failure on line 187
Note that it doesn't fail every time, but most of the time. Before this
change, it always passed.
I'm attaching a dump of the affected shader from before and after this
change. Let me know if you need more information.
--
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
-------------- next part --------------
TESS_CTRL
PROPERTY TCS_VERTICES_OUT 3
DCL SV[0], INVOCATIONID
DCL SV[1], PRIM_ID
DCL OUT[][0..1], ARRAY(1), CLIPDIST
DCL OUT[2], TESSOUTER
DCL OUT[3].xy, TESSINNER
DCL CONST[0][0..95]
DCL TEMP[0..23], LOCAL
IMM[0] INT32 {3, 8, 0, 0}
IMM[1] FLT32 { 1.0000, 0.0000, 0.0000, 0.0000}
0: MOV TEMP[0].yzw, OUT[SV[0].x](1)[0].zyzw
1: UMAD TEMP[1].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
2: UMUL TEMP[2].x, TEMP[1].xxxx, IMM[0].yyyy
3: MOV TEMP[0].x, CONST[0][TEMP[2].x].xxxx
4: MOV OUT[SV[0].x](1)[0], TEMP[0]
5: MOV TEMP[3].xzw, OUT[SV[0].x](1)[0].xxzw
6: UMAD TEMP[4].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
7: UMUL TEMP[5].x, TEMP[4].xxxx, IMM[0].yyyy
8: MOV TEMP[3].y, CONST[0][TEMP[5].x+1].yxyy
9: MOV OUT[SV[0].x](1)[0], TEMP[3]
10: MOV TEMP[6].xyw, OUT[SV[0].x](1)[0].xyxw
11: UMAD TEMP[7].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
12: UMUL TEMP[8].x, TEMP[7].xxxx, IMM[0].yyyy
13: MOV TEMP[6].z, CONST[0][TEMP[8].x+2].zzxz
14: MOV OUT[SV[0].x](1)[0], TEMP[6]
15: MOV TEMP[9].xyz, OUT[SV[0].x](1)[0].xyzx
16: UMAD TEMP[10].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
17: UMUL TEMP[11].x, TEMP[10].xxxx, IMM[0].yyyy
18: MOV TEMP[9].w, CONST[0][TEMP[11].x+3].wwwx
19: MOV OUT[SV[0].x](1)[0], TEMP[9]
20: MOV TEMP[12].yzw, OUT[SV[0].x](1)[1].zyzw
21: UMAD TEMP[13].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
22: UMUL TEMP[14].x, TEMP[13].xxxx, IMM[0].yyyy
23: MOV TEMP[12].x, CONST[0][TEMP[14].x+4].xxxx
24: MOV OUT[SV[0].x](1)[1], TEMP[12]
25: MOV TEMP[15].xzw, OUT[SV[0].x](1)[1].xxzw
26: UMAD TEMP[16].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
27: UMUL TEMP[17].x, TEMP[16].xxxx, IMM[0].yyyy
28: MOV TEMP[15].y, CONST[0][TEMP[17].x+5].yxyy
29: MOV OUT[SV[0].x](1)[1], TEMP[15]
30: MOV TEMP[18].xyw, OUT[SV[0].x](1)[1].xyxw
31: UMAD TEMP[19].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
32: UMUL TEMP[20].x, TEMP[19].xxxx, IMM[0].yyyy
33: MOV TEMP[18].z, CONST[0][TEMP[20].x+6].zzxz
34: MOV OUT[SV[0].x](1)[1], TEMP[18]
35: MOV TEMP[21].xyz, OUT[SV[0].x](1)[1].xyzx
36: UMAD TEMP[22].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
37: UMUL TEMP[23].x, TEMP[22].xxxx, IMM[0].yyyy
38: MOV TEMP[21].w, CONST[0][TEMP[23].x+7].wwwx
39: MOV OUT[SV[0].x](1)[1], TEMP[21]
40: MOV OUT[2].x, IMM[1].xxxx
41: MOV OUT[2].y, IMM[1].xxxx
42: MOV OUT[2].z, IMM[1].xxxx
43: MOV OUT[2].w, IMM[1].xxxx
44: MOV OUT[3].x, IMM[1].xxxx
45: MOV OUT[3].y, IMM[1].xxxx
46: END
radeonsi: Compiling shader 2
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0
; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #1
define amdgpu_hs void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x float] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32, i32) #2 {
main_body:
%14 = ptrtoint [0 x float] addrspace(2)* %2 to i64
%15 = bitcast i64 %14 to <2 x i32>
%16 = lshr i32 %13, 8
%17 = and i32 %16, 31
%18 = shl i32 %5, 2
%19 = and i32 %18, 262140
%20 = and i32 %13, 255
%21 = mul nuw nsw i32 %20, 608
%22 = add nuw nsw i32 %19, %21
%23 = mul nuw nsw i32 %17, 200
%24 = add nuw nsw i32 %22, %23
%25 = add nuw nsw i32 %24, 192
%26 = add nuw nsw i32 %24, 193
%27 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %26
%28 = load i32, i32 addrspace(3)* %27, align 4, !noalias !0
%29 = add nuw nsw i32 %24, 194
%30 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %29
%31 = load i32, i32 addrspace(3)* %30, align 8, !noalias !0
%32 = add nuw nsw i32 %24, 195
%33 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %32
%34 = load i32, i32 addrspace(3)* %33, align 4, !noalias !0
%35 = mul i32 %12, 3
%36 = add i32 %35, %17
%37 = shl i32 %36, 7
%38 = shufflevector <2 x i32> %15, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%39 = shufflevector <2 x i32> %15, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
%40 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 1536, i32 163756>, <4 x i32> %38, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
%41 = shufflevector <4 x i32> %40, <4 x i32> %39, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
%42 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %37)
%43 = zext i32 %8 to i64
%44 = shl nuw nsw i64 %43, 16
%45 = insertelement <2 x i64> <i64 undef, i64 703330959491071>, i64 %44, i32 0
%46 = bitcast <2 x i64> %45 to <4 x i32>
%47 = and i32 %4, 63
%48 = mul nuw nsw i32 %20, 3
%49 = add nuw nsw i32 %48, %17
%50 = mul nuw nsw i32 %47, 144
%51 = add nuw nsw i32 %49, %50
%52 = shl nuw nsw i32 %51, 4
%53 = bitcast float %42 to i32
%54 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %25
%55 = bitcast i32 addrspace(3)* %54 to float addrspace(3)*
store float %42, float addrspace(3)* %55, align 16, !noalias !0
%56 = insertelement <4 x i32> undef, i32 %53, i32 0
%57 = insertelement <4 x i32> %56, i32 %28, i32 1
%58 = insertelement <4 x i32> %57, i32 %31, i32 2
%59 = insertelement <4 x i32> %58, i32 %34, i32 3
%60 = add i32 %52, %10
%61 = bitcast <4 x i32> %59 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %61, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
%62 = or i32 %37, 16
%63 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %62)
store float %42, float addrspace(3)* %55, align 16, !noalias !0
%64 = bitcast float %63 to i32
%65 = bitcast i32 addrspace(3)* %27 to float addrspace(3)*
store float %63, float addrspace(3)* %65, align 4, !noalias !0
%66 = insertelement <4 x i32> %56, i32 %64, i32 1
%67 = insertelement <4 x i32> %66, i32 %31, i32 2
%68 = insertelement <4 x i32> %67, i32 %34, i32 3
%69 = bitcast <4 x i32> %68 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %69, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
%70 = or i32 %37, 32
%71 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %70)
store float %42, float addrspace(3)* %55, align 16, !noalias !0
store float %63, float addrspace(3)* %65, align 4, !noalias !0
%72 = bitcast float %71 to i32
%73 = bitcast i32 addrspace(3)* %30 to float addrspace(3)*
store float %71, float addrspace(3)* %73, align 8, !noalias !0
%74 = insertelement <4 x i32> %66, i32 %72, i32 2
%75 = insertelement <4 x i32> %74, i32 %34, i32 3
%76 = bitcast <4 x i32> %75 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %76, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
%77 = or i32 %37, 48
%78 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %77)
store float %42, float addrspace(3)* %55, align 16, !noalias !0
store float %63, float addrspace(3)* %65, align 4, !noalias !0
store float %71, float addrspace(3)* %73, align 8, !noalias !0
%79 = bitcast float %78 to i32
%80 = bitcast i32 addrspace(3)* %33 to float addrspace(3)*
store float %78, float addrspace(3)* %80, align 4, !noalias !0
%81 = insertelement <4 x i32> %74, i32 %79, i32 3
%82 = bitcast <4 x i32> %81 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %82, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
%83 = add nuw nsw i32 %24, 196
%84 = add nuw nsw i32 %24, 197
%85 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %84
%86 = load i32, i32 addrspace(3)* %85, align 4, !noalias !0
%87 = add nuw nsw i32 %24, 198
%88 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %87
%89 = load i32, i32 addrspace(3)* %88, align 8, !noalias !0
%90 = add nuw nsw i32 %24, 199
%91 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %90
%92 = load i32, i32 addrspace(3)* %91, align 4, !noalias !0
%93 = or i32 %37, 64
%94 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %93)
%95 = mul nuw nsw i32 %47, 147
%96 = add nuw nsw i32 %49, %95
%97 = shl nuw nsw i32 %96, 4
%98 = bitcast float %94 to i32
%99 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %83
%100 = bitcast i32 addrspace(3)* %99 to float addrspace(3)*
store float %94, float addrspace(3)* %100, align 16, !noalias !0
%101 = insertelement <4 x i32> undef, i32 %98, i32 0
%102 = insertelement <4 x i32> %101, i32 %86, i32 1
%103 = insertelement <4 x i32> %102, i32 %89, i32 2
%104 = insertelement <4 x i32> %103, i32 %92, i32 3
%105 = add i32 %97, %10
%106 = bitcast <4 x i32> %104 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %106, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
%107 = or i32 %37, 80
%108 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %107)
store float %94, float addrspace(3)* %100, align 16, !noalias !0
%109 = bitcast float %108 to i32
%110 = bitcast i32 addrspace(3)* %85 to float addrspace(3)*
store float %108, float addrspace(3)* %110, align 4, !noalias !0
%111 = insertelement <4 x i32> %101, i32 %109, i32 1
%112 = insertelement <4 x i32> %111, i32 %89, i32 2
%113 = insertelement <4 x i32> %112, i32 %92, i32 3
%114 = bitcast <4 x i32> %113 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %114, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
%115 = or i32 %37, 96
%116 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %115)
store float %94, float addrspace(3)* %100, align 16, !noalias !0
store float %108, float addrspace(3)* %110, align 4, !noalias !0
%117 = bitcast float %116 to i32
%118 = bitcast i32 addrspace(3)* %88 to float addrspace(3)*
store float %116, float addrspace(3)* %118, align 8, !noalias !0
%119 = insertelement <4 x i32> %111, i32 %117, i32 2
%120 = insertelement <4 x i32> %119, i32 %92, i32 3
%121 = bitcast <4 x i32> %120 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %121, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
%122 = or i32 %37, 112
%123 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %122)
store float %94, float addrspace(3)* %100, align 16, !noalias !0
store float %108, float addrspace(3)* %110, align 4, !noalias !0
store float %116, float addrspace(3)* %118, align 8, !noalias !0
%124 = bitcast float %123 to i32
%125 = bitcast i32 addrspace(3)* %91 to float addrspace(3)*
store float %123, float addrspace(3)* %125, align 4, !noalias !0
%126 = insertelement <4 x i32> %119, i32 %124, i32 3
%127 = bitcast <4 x i32> %126 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %127, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
%128 = icmp eq i32 %17, 0
br i1 %128, label %if-true-block.i, label %tcs_epilog.exit
if-true-block.i: ; preds = %main_body
%129 = zext i32 %9 to i64
%130 = shl nuw nsw i64 %129, 16
%131 = insertelement <2 x i64> <i64 undef, i64 703330959491071>, i64 %130, i32 0
%132 = bitcast <2 x i64> %131 to <4 x i32>
%133 = mul nuw nsw i32 %20, 24
%134 = icmp eq i32 %20, 0
br i1 %134, label %if-true-block2.i, label %endif-block1.i
if-true-block2.i: ; preds = %if-true-block.i
call void @llvm.amdgcn.buffer.store.f32(float -0.000000e+00, <4 x i32> %132, i32 0, i32 %11, i1 true, i1 false) #3
br label %endif-block1.i
endif-block1.i: ; preds = %if-true-block2.i, %if-true-block.i
%135 = add i32 %11, 4
%136 = add i32 %135, %133
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x i32> %132, i32 0, i32 %136, i1 true, i1 false) #3
%137 = add i32 %11, 20
%138 = add i32 %137, %133
call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <4 x i32> %132, i32 0, i32 %138, i1 true, i1 false) #3
br label %tcs_epilog.exit
tcs_epilog.exit: ; preds = %main_body, %endif-block1.i
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind writeonly }
attributes #2 = { "amdgpu-max-work-group-size"="128" "no-signed-zeros-fp-math"="true" }
attributes #3 = { inaccessiblememonly nounwind }
!0 = !{!1}
!1 = distinct !{!1, !2, !"main: argument 0"}
!2 = distinct !{!2, !"main"}
SHADER KEY
part.tcs.epilog.prim_mode = 7
mono.u.ff_tcs_inputs_to_copy = 0x0
Tessellation Control Shader:
Shader main disassembly:
v_mul_lo_i32 v0, v0, 3 ; D2850000 00010700
v_bfe_u32 v17, v1, 8, 5 ; D1C80011 02151101
s_lshl_b32 s0, s9, 2 ; 8E008209
s_and_b32 s0, s0, 0x3fffc ; 8600FF00 0003FFFC
v_add_i32_e32 v0, vcc, v0, v17 ; 32002300
v_lshlrev_b32_e32 v18, 7, v0 ; 24240087
v_and_b32_e32 v0, 0xff, v1 ; 260002FF 000000FF
v_mov_b32_e32 v1, 0x260 ; 7E0202FF 00000260
s_mov_b32 s3, 0x27fac ; BE8300FF 00027FAC
v_mad_u32_u24 v1, v0, v1, s0 ; D1C30001 00020300
v_mov_b32_e32 v3, 0xc8 ; 7E0602FF 000000C8
v_mad_u32_u24 v1, v17, v3, v1 ; D1C30001 04060711
s_movk_i32 s6, 0x600 ; B0060600
s_mov_b32 s7, s3 ; BE870003
v_lshlrev_b32_e32 v19, 2, v1 ; 24260282
buffer_load_dword v2, v18, s[4:7], 0 offen ; E0501000 80010212
s_and_b32 s8, s8, 63 ; 8608BF08
v_mad_u32_u24 v1, v0, 3, v17 ; D1C30001 04450700
v_mov_b32_e32 v5, 0x90 ; 7E0A02FF 00000090
v_mad_u32_u24 v1, s8, v5, v1 ; D1C30001 04060A08
v_or_b32_e32 v5, 16, v18 ; 280A2490
buffer_load_dword v14, v5, s[4:7], 0 offen ; E0501000 80010E05
s_mov_b32 m0, -1 ; BEFC00C1
ds_read2_b32 v[3:4], v19 offset0:193 offset1:194 ; D86EC2C1 03000013
v_or_b32_e32 v15, 32, v18 ; 281E24A0
s_mov_b32 s11, 0 ; BE8B0080
s_mov_b32 s10, s13 ; BE8A000D
s_mov_b32 s13, s11 ; BE8D000B
s_lshl_b64 s[0:1], s[12:13], 16 ; 8E80900C
s_mov_b32 s2, -1 ; BE8200C1
v_or_b32_e32 v21, 0x50, v18 ; 282A24FF 00000050
s_waitcnt vmcnt(1) lgkmcnt(0) ; BF8C0071
v_mov_b32_e32 v9, v5 ; 7E120305
v_mov_b32_e32 v7, v3 ; 7E0E0303
v_mov_b32_e32 v8, v4 ; 7E100304
v_mov_b32_e32 v6, v2 ; 7E0C0302
v_lshlrev_b32_e32 v5, 4, v1 ; 240A0284
v_add_i32_e32 v20, vcc, s14, v5 ; 32280A0E
s_waitcnt vmcnt(0) ; BF8C0F70
v_mov_b32_e32 v7, v14 ; 7E0E030E
v_mov_b32_e32 v13, v9 ; 7E1A0309
v_mov_b32_e32 v12, v8 ; 7E180308
v_mov_b32_e32 v11, v7 ; 7E160307
v_mov_b32_e32 v10, v6 ; 7E140306
buffer_load_dword v8, v15, s[4:7], 0 offen ; E0501000 8001080F
v_mov_b32_e32 v13, v2 ; 7E1A0302
v_or_b32_e32 v5, 48, v18 ; 280A24B0
v_mad_u32_u24 v1, s8, 3, v1 ; D1C30001 04050608
v_lshlrev_b32_e32 v1, 4, v1 ; 24020284
v_mov_b32_e32 v12, v4 ; 7E180304
s_waitcnt vmcnt(0) ; BF8C0F70
ds_write2_b32 v19, v2, v8 offset0:192 offset1:194 ; D81CC2C0 00080213
ds_write_b64 v19, v[13:14] offset:768 ; D89A0300 00000D13
ds_write_b64 v19, v[13:14] offset:768 ; D89A0300 00000D13
ds_write_b64 v19, v[13:14] offset:768 ; D89A0300 00000D13
buffer_load_dword v14, v5, s[4:7], 0 offen ; E0501000 80010E05
ds_read2_b32 v[15:16], v19 offset0:195 offset1:199 ; D86EC7C3 0F000013
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v5, v15 ; 7E0A030F
v_mov_b32_e32 v13, v15 ; 7E1A030F
s_waitcnt vmcnt(0) ; BF8C0F70
v_mov_b32_e32 v9, v14 ; 7E12030E
buffer_store_dwordx4 v[6:9], v20, s[0:3], 0 offen glc ; E07C5000 80000614
buffer_store_dwordx4 v[2:5], v20, s[0:3], 0 offen glc ; E07C5000 80000214
s_waitcnt expcnt(0) ; BF8C0F0F
ds_read2_b32 v[2:3], v19 offset0:197 offset1:198 ; D86EC6C5 02000013
v_add_i32_e32 v5, vcc, s14, v1 ; 320A020E
v_or_b32_e32 v1, 64, v18 ; 280224C0
buffer_load_dword v1, v1, s[4:7], 0 offen ; E0501000 80010101
v_mov_b32_e32 v9, v15 ; 7E12030F
v_mov_b32_e32 v4, v16 ; 7E080310
buffer_store_dwordx4 v[10:13], v20, s[0:3], 0 offen glc ; E07C5000 80000A14
buffer_store_dwordx4 v[6:9], v20, s[0:3], 0 offen glc ; E07C5000 80000614
s_waitcnt expcnt(1) ; BF8C0F1F
v_or_b32_e32 v10, 0x60, v18 ; 281424FF 00000060
v_or_b32_e32 v11, 0x70, v18 ; 281624FF 00000070
v_cmp_eq_u32_e32 vcc, 0, v17 ; 7D942280
v_mov_b32_e32 v13, v8 ; 7E1A0308
s_waitcnt vmcnt(2) lgkmcnt(0) ; BF8C0072
buffer_store_dwordx4 v[1:4], v5, s[0:3], 0 offen glc ; E07C5000 80000105
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v2, v21, s[4:7], 0 offen ; E0501000 80010215
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[1:4], v5, s[0:3], 0 offen glc ; E07C5000 80000105
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v3, v10, s[4:7], 0 offen ; E0501000 8001030A
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[1:4], v5, s[0:3], 0 offen glc ; E07C5000 80000105
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v4, v11, s[4:7], 0 offen ; E0501000 8001040B
ds_write_b32 v19, v1 offset:784 ; D81A0310 00000113
ds_write2_b64 v19, v[13:14], v[1:2] offset0:97 offset1:98 ; D89C6261 00010D13
ds_write_b64 v19, v[1:2] offset:784 ; D89A0310 00000113
ds_write_b32 v19, v3 offset:792 ; D81A0318 00000313
s_waitcnt vmcnt(0) ; BF8C0F70
ds_write2_b64 v19, v[1:2], v[3:4] offset0:98 offset1:99 ; D89C6362 00030113
buffer_store_dwordx4 v[1:4], v5, s[0:3], 0 offen glc ; E07C5000 80000105
s_and_saveexec_b64 s[0:1], vcc ; BE80206A
s_cbranch_execz BB0_4 ; BF880000
s_lshl_b64 s[0:1], s[10:11], 16 ; 8E80900A
v_cmp_eq_u32_e32 vcc, 0, v0 ; 7D940080
s_and_saveexec_b64 s[4:5], vcc ; BE84206A
s_waitcnt expcnt(0) ; BF8C0F0F
v_bfrev_b32_e32 v1, 1 ; 7E025881
v_mov_b32_e32 v2, s15 ; 7E04020F
buffer_store_dword v1, v2, s[0:3], 0 offen glc ; E0705000 80000102
s_or_b64 exec, exec, s[4:5] ; 87FE047E
s_waitcnt expcnt(0) ; BF8C0F0F
v_mov_b32_e32 v1, 1.0 ; 7E0202F2
v_mov_b32_e32 v2, v1 ; 7E040301
v_mov_b32_e32 v3, v1 ; 7E060301
v_mov_b32_e32 v4, v1 ; 7E080301
v_mad_u32_u24 v0, v0, 24, s15 ; D1C30000 003D3100
buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 glc ; E07C5004 80000100
buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:20 glc ; E0745014 80000100
s_endpgm ; BF810000
*** SHADER STATS ***
SGPRS: 96
VGPRS: 24
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 656 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************
-------------- next part --------------
TESS_CTRL
PROPERTY TCS_VERTICES_OUT 3
DCL SV[0], INVOCATIONID
DCL SV[1], PRIM_ID
DCL OUT[][0..1], ARRAY(1), CLIPDIST
DCL OUT[2], TESSOUTER
DCL OUT[3].xy, TESSINNER
DCL CONST[0][0..95]
DCL TEMP[0..23], LOCAL
IMM[0] INT32 {3, 8, 0, 0}
IMM[1] FLT32 { 1.0000, 0.0000, 0.0000, 0.0000}
0: MOV TEMP[0].yzw, OUT[SV[0].x](1)[0].zyzw
1: UMAD TEMP[1].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
2: UMUL TEMP[2].x, TEMP[1].xxxx, IMM[0].yyyy
3: MOV TEMP[0].x, CONST[0][TEMP[2].x].xxxx
4: MOV OUT[SV[0].x](1)[0], TEMP[0]
5: MOV TEMP[3].xzw, OUT[SV[0].x](1)[0].xxzw
6: UMAD TEMP[4].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
7: UMUL TEMP[5].x, TEMP[4].xxxx, IMM[0].yyyy
8: MOV TEMP[3].y, CONST[0][TEMP[5].x+1].yxyy
9: MOV OUT[SV[0].x](1)[0], TEMP[3]
10: MOV TEMP[6].xyw, OUT[SV[0].x](1)[0].xyxw
11: UMAD TEMP[7].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
12: UMUL TEMP[8].x, TEMP[7].xxxx, IMM[0].yyyy
13: MOV TEMP[6].z, CONST[0][TEMP[8].x+2].zzxz
14: MOV OUT[SV[0].x](1)[0], TEMP[6]
15: MOV TEMP[9].xyz, OUT[SV[0].x](1)[0].xyzx
16: UMAD TEMP[10].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
17: UMUL TEMP[11].x, TEMP[10].xxxx, IMM[0].yyyy
18: MOV TEMP[9].w, CONST[0][TEMP[11].x+3].wwwx
19: MOV OUT[SV[0].x](1)[0], TEMP[9]
20: MOV TEMP[12].yzw, OUT[SV[0].x](1)[1].zyzw
21: UMAD TEMP[13].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
22: UMUL TEMP[14].x, TEMP[13].xxxx, IMM[0].yyyy
23: MOV TEMP[12].x, CONST[0][TEMP[14].x+4].xxxx
24: MOV OUT[SV[0].x](1)[1], TEMP[12]
25: MOV TEMP[15].xzw, OUT[SV[0].x](1)[1].xxzw
26: UMAD TEMP[16].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
27: UMUL TEMP[17].x, TEMP[16].xxxx, IMM[0].yyyy
28: MOV TEMP[15].y, CONST[0][TEMP[17].x+5].yxyy
29: MOV OUT[SV[0].x](1)[1], TEMP[15]
30: MOV TEMP[18].xyw, OUT[SV[0].x](1)[1].xyxw
31: UMAD TEMP[19].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
32: UMUL TEMP[20].x, TEMP[19].xxxx, IMM[0].yyyy
33: MOV TEMP[18].z, CONST[0][TEMP[20].x+6].zzxz
34: MOV OUT[SV[0].x](1)[1], TEMP[18]
35: MOV TEMP[21].xyz, OUT[SV[0].x](1)[1].xyzx
36: UMAD TEMP[22].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
37: UMUL TEMP[23].x, TEMP[22].xxxx, IMM[0].yyyy
38: MOV TEMP[21].w, CONST[0][TEMP[23].x+7].wwwx
39: MOV OUT[SV[0].x](1)[1], TEMP[21]
40: MOV OUT[2].x, IMM[1].xxxx
41: MOV OUT[2].y, IMM[1].xxxx
42: MOV OUT[2].z, IMM[1].xxxx
43: MOV OUT[2].w, IMM[1].xxxx
44: MOV OUT[3].x, IMM[1].xxxx
45: MOV OUT[3].y, IMM[1].xxxx
46: END
radeonsi: Compiling shader 2
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0
; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #1
define amdgpu_hs void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x float] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32, i32) #2 {
main_body:
%14 = ptrtoint [0 x float] addrspace(2)* %2 to i64
%15 = bitcast i64 %14 to <2 x i32>
%16 = lshr i32 %13, 8
%17 = and i32 %16, 31
%18 = shl i32 %5, 2
%19 = and i32 %18, 262140
%20 = and i32 %13, 255
%21 = mul nuw nsw i32 %20, 608
%22 = add nuw nsw i32 %19, %21
%23 = mul nuw nsw i32 %17, 200
%24 = add nuw nsw i32 %22, %23
%25 = add nuw nsw i32 %24, 192
%26 = add nuw nsw i32 %24, 193
%27 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %26
%28 = load i32, i32 addrspace(3)* %27, align 4, !noalias !0
%29 = add nuw nsw i32 %24, 194
%30 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %29
%31 = load i32, i32 addrspace(3)* %30, align 8, !noalias !0
%32 = add nuw nsw i32 %24, 195
%33 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %32
%34 = load i32, i32 addrspace(3)* %33, align 4, !noalias !0
%35 = mul i32 %12, 3
%36 = add i32 %35, %17
%37 = shl i32 %36, 7
%38 = shufflevector <2 x i32> %15, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%39 = shufflevector <2 x i32> %15, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
%40 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 1536, i32 163756>, <4 x i32> %38, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
%41 = shufflevector <4 x i32> %40, <4 x i32> %39, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
%42 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %37)
%43 = zext i32 %8 to i64
%44 = shl nuw nsw i64 %43, 16
%45 = insertelement <2 x i64> <i64 undef, i64 703330959491071>, i64 %44, i32 0
%46 = bitcast <2 x i64> %45 to <4 x i32>
%47 = and i32 %4, 63
%48 = mul nuw nsw i32 %20, 3
%49 = add nuw nsw i32 %48, %17
%50 = mul nuw nsw i32 %47, 144
%51 = add nuw nsw i32 %49, %50
%52 = shl nuw nsw i32 %51, 4
%53 = bitcast float %42 to i32
%54 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %25
%55 = bitcast i32 addrspace(3)* %54 to float addrspace(3)*
store float %42, float addrspace(3)* %55, align 16, !noalias !0
%56 = insertelement <4 x i32> undef, i32 %53, i32 0
%57 = insertelement <4 x i32> %56, i32 %28, i32 1
%58 = insertelement <4 x i32> %57, i32 %31, i32 2
%59 = insertelement <4 x i32> %58, i32 %34, i32 3
%60 = add i32 %52, %10
%61 = bitcast <4 x i32> %59 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %61, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
%62 = or i32 %37, 16
%63 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %62)
store float %42, float addrspace(3)* %55, align 16, !noalias !0
%64 = bitcast float %63 to i32
%65 = bitcast i32 addrspace(3)* %27 to float addrspace(3)*
store float %63, float addrspace(3)* %65, align 4, !noalias !0
%66 = insertelement <4 x i32> %56, i32 %64, i32 1
%67 = insertelement <4 x i32> %66, i32 %31, i32 2
%68 = insertelement <4 x i32> %67, i32 %34, i32 3
%69 = bitcast <4 x i32> %68 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %69, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
%70 = or i32 %37, 32
%71 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %70)
store float %42, float addrspace(3)* %55, align 16, !noalias !0
store float %63, float addrspace(3)* %65, align 4, !noalias !0
%72 = bitcast float %71 to i32
%73 = bitcast i32 addrspace(3)* %30 to float addrspace(3)*
store float %71, float addrspace(3)* %73, align 8, !noalias !0
%74 = insertelement <4 x i32> %66, i32 %72, i32 2
%75 = insertelement <4 x i32> %74, i32 %34, i32 3
%76 = bitcast <4 x i32> %75 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %76, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
%77 = or i32 %37, 48
%78 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %77)
store float %42, float addrspace(3)* %55, align 16, !noalias !0
store float %63, float addrspace(3)* %65, align 4, !noalias !0
store float %71, float addrspace(3)* %73, align 8, !noalias !0
%79 = bitcast float %78 to i32
%80 = bitcast i32 addrspace(3)* %33 to float addrspace(3)*
store float %78, float addrspace(3)* %80, align 4, !noalias !0
%81 = insertelement <4 x i32> %74, i32 %79, i32 3
%82 = bitcast <4 x i32> %81 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %82, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
%83 = add nuw nsw i32 %24, 196
%84 = add nuw nsw i32 %24, 197
%85 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %84
%86 = load i32, i32 addrspace(3)* %85, align 4, !noalias !0
%87 = add nuw nsw i32 %24, 198
%88 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %87
%89 = load i32, i32 addrspace(3)* %88, align 8, !noalias !0
%90 = add nuw nsw i32 %24, 199
%91 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %90
%92 = load i32, i32 addrspace(3)* %91, align 4, !noalias !0
%93 = or i32 %37, 64
%94 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %93)
%95 = mul nuw nsw i32 %47, 147
%96 = add nuw nsw i32 %49, %95
%97 = shl nuw nsw i32 %96, 4
%98 = bitcast float %94 to i32
%99 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %83
%100 = bitcast i32 addrspace(3)* %99 to float addrspace(3)*
store float %94, float addrspace(3)* %100, align 16, !noalias !0
%101 = insertelement <4 x i32> undef, i32 %98, i32 0
%102 = insertelement <4 x i32> %101, i32 %86, i32 1
%103 = insertelement <4 x i32> %102, i32 %89, i32 2
%104 = insertelement <4 x i32> %103, i32 %92, i32 3
%105 = add i32 %97, %10
%106 = bitcast <4 x i32> %104 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %106, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
%107 = or i32 %37, 80
%108 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %107)
store float %94, float addrspace(3)* %100, align 16, !noalias !0
%109 = bitcast float %108 to i32
%110 = bitcast i32 addrspace(3)* %85 to float addrspace(3)*
store float %108, float addrspace(3)* %110, align 4, !noalias !0
%111 = insertelement <4 x i32> %101, i32 %109, i32 1
%112 = insertelement <4 x i32> %111, i32 %89, i32 2
%113 = insertelement <4 x i32> %112, i32 %92, i32 3
%114 = bitcast <4 x i32> %113 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %114, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
%115 = or i32 %37, 96
%116 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %115)
store float %94, float addrspace(3)* %100, align 16, !noalias !0
store float %108, float addrspace(3)* %110, align 4, !noalias !0
%117 = bitcast float %116 to i32
%118 = bitcast i32 addrspace(3)* %88 to float addrspace(3)*
store float %116, float addrspace(3)* %118, align 8, !noalias !0
%119 = insertelement <4 x i32> %111, i32 %117, i32 2
%120 = insertelement <4 x i32> %119, i32 %92, i32 3
%121 = bitcast <4 x i32> %120 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %121, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
%122 = or i32 %37, 112
%123 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %122)
store float %94, float addrspace(3)* %100, align 16, !noalias !0
store float %108, float addrspace(3)* %110, align 4, !noalias !0
store float %116, float addrspace(3)* %118, align 8, !noalias !0
%124 = bitcast float %123 to i32
%125 = bitcast i32 addrspace(3)* %91 to float addrspace(3)*
store float %123, float addrspace(3)* %125, align 4, !noalias !0
%126 = insertelement <4 x i32> %119, i32 %124, i32 3
%127 = bitcast <4 x i32> %126 to <4 x float>
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %127, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
%128 = icmp eq i32 %17, 0
br i1 %128, label %if-true-block.i, label %tcs_epilog.exit
if-true-block.i: ; preds = %main_body
%129 = zext i32 %9 to i64
%130 = shl nuw nsw i64 %129, 16
%131 = insertelement <2 x i64> <i64 undef, i64 703330959491071>, i64 %130, i32 0
%132 = bitcast <2 x i64> %131 to <4 x i32>
%133 = mul nuw nsw i32 %20, 24
%134 = icmp eq i32 %20, 0
br i1 %134, label %if-true-block2.i, label %endif-block1.i
if-true-block2.i: ; preds = %if-true-block.i
call void @llvm.amdgcn.buffer.store.f32(float -0.000000e+00, <4 x i32> %132, i32 0, i32 %11, i1 true, i1 false) #3
br label %endif-block1.i
endif-block1.i: ; preds = %if-true-block2.i, %if-true-block.i
%135 = add i32 %11, 4
%136 = add i32 %135, %133
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x i32> %132, i32 0, i32 %136, i1 true, i1 false) #3
%137 = add i32 %11, 20
%138 = add i32 %137, %133
call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <4 x i32> %132, i32 0, i32 %138, i1 true, i1 false) #3
br label %tcs_epilog.exit
tcs_epilog.exit: ; preds = %main_body, %endif-block1.i
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind writeonly }
attributes #2 = { "amdgpu-max-work-group-size"="128" "no-signed-zeros-fp-math"="true" }
attributes #3 = { inaccessiblememonly nounwind }
!0 = !{!1}
!1 = distinct !{!1, !2, !"main: argument 0"}
!2 = distinct !{!2, !"main"}
SHADER KEY
part.tcs.epilog.prim_mode = 7
mono.u.ff_tcs_inputs_to_copy = 0x0
Tessellation Control Shader:
Shader main disassembly:
v_mul_lo_i32 v0, v0, 3 ; D2850000 00010700
v_bfe_u32 v6, v1, 8, 5 ; D1C80006 02151101
s_mov_b32 s3, 0x27fac ; BE8300FF 00027FAC
s_movk_i32 s6, 0x600 ; B0060600
v_add_i32_e32 v0, vcc, v0, v6 ; 32000D00
v_lshlrev_b32_e32 v7, 7, v0 ; 240E0087
s_mov_b32 s7, s3 ; BE870003
buffer_load_dword v2, v7, s[4:7], 0 offen ; E0501000 80010207
s_lshl_b32 s0, s9, 2 ; 8E008209
v_and_b32_e32 v0, 0xff, v1 ; 260002FF 000000FF
v_mov_b32_e32 v3, 0x260 ; 7E0602FF 00000260
s_and_b32 s9, s0, 0x3fffc ; 8609FF00 0003FFFC
v_mov_b32_e32 v4, 0xc8 ; 7E0802FF 000000C8
v_mad_u32_u24 v3, v0, v3, s9 ; D1C30003 00260700
v_mov_b32_e32 v5, 0x90 ; 7E0A02FF 00000090
s_and_b32 s8, s8, 63 ; 8608BF08
v_mad_u32_u24 v1, v0, 3, v6 ; D1C30001 04190700
v_mad_u32_u24 v3, v6, v4, v3 ; D1C30003 040E0906
v_mad_u32_u24 v1, s8, v5, v1 ; D1C30001 04060A08
v_lshlrev_b32_e32 v4, 4, v1 ; 24080284
s_mov_b32 m0, -1 ; BEFC00C1
v_lshlrev_b32_e32 v8, 2, v3 ; 24100682
v_add_i32_e32 v9, vcc, s14, v4 ; 3212080E
ds_read2_b32 v[3:4], v8 offset0:193 offset1:194 ; D86EC2C1 03000008
ds_read_b32 v5, v8 offset:780 ; D86C030C 05000008
s_mov_b32 s11, 0 ; BE8B0080
v_mad_u32_u24 v1, s8, 3, v1 ; D1C30001 04050608
s_mov_b32 s10, s13 ; BE8A000D
s_mov_b32 s13, s11 ; BE8D000B
v_lshlrev_b32_e32 v1, 4, v1 ; 24020284
s_mov_b32 s2, -1 ; BE8200C1
s_lshl_b64 s[0:1], s[12:13], 16 ; 8E80900C
v_add_i32_e32 v10, vcc, s14, v1 ; 3214020E
v_or_b32_e32 v1, 16, v7 ; 28020E90
v_or_b32_e32 v11, 32, v7 ; 28160EA0
v_or_b32_e32 v12, 48, v7 ; 28180EB0
v_or_b32_e32 v13, 64, v7 ; 281A0EC0
v_or_b32_e32 v14, 0x50, v7 ; 281C0EFF 00000050
v_or_b32_e32 v15, 0x60, v7 ; 281E0EFF 00000060
v_or_b32_e32 v7, 0x70, v7 ; 280E0EFF 00000070
v_cmp_eq_u32_e32 vcc, 0, v6 ; 7D940C80
s_waitcnt vmcnt(0) ; BF8C0F70
ds_write_b32 v8, v2 offset:768 ; D81A0300 00000208
s_waitcnt lgkmcnt(1) ; BF8C017F
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[2:5], v9, s[0:3], 0 offen glc ; E07C5000 80000209
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v3, v1, s[4:7], 0 offen ; E0501000 80010301
s_waitcnt vmcnt(0) ; BF8C0F70
ds_write_b64 v8, v[2:3] offset:768 ; D89A0300 00000208
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[2:5], v9, s[0:3], 0 offen glc ; E07C5000 80000209
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v4, v11, s[4:7], 0 offen ; E0501000 8001040B
ds_write_b64 v8, v[2:3] offset:768 ; D89A0300 00000208
s_waitcnt vmcnt(0) ; BF8C0F70
ds_write_b32 v8, v4 offset:776 ; D81A0308 00000408
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[2:5], v9, s[0:3], 0 offen glc ; E07C5000 80000209
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v5, v12, s[4:7], 0 offen ; E0501000 8001050C
buffer_load_dword v1, v13, s[4:7], 0 offen ; E0501000 8001010D
s_waitcnt vmcnt(1) ; BF8C0F71
ds_write2_b64 v8, v[2:3], v[4:5] offset0:96 offset1:97 ; D89C6160 00040208
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[2:5], v9, s[0:3], 0 offen glc ; E07C5000 80000209
s_waitcnt vmcnt(1) ; BF8C0F71
ds_write_b32 v8, v1 offset:784 ; D81A0310 00000108
s_waitcnt expcnt(0) ; BF8C0F0F
ds_read2_b32 v[2:3], v8 offset0:197 offset1:198 ; D86EC6C5 02000008
ds_read_b32 v4, v8 offset:796 ; D86C031C 04000008
s_waitcnt lgkmcnt(0) ; BF8C007F
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[1:4], v10, s[0:3], 0 offen glc ; E07C5000 8000010A
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v2, v14, s[4:7], 0 offen ; E0501000 8001020E
s_waitcnt vmcnt(0) ; BF8C0F70
ds_write_b64 v8, v[1:2] offset:784 ; D89A0310 00000108
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[1:4], v10, s[0:3], 0 offen glc ; E07C5000 8000010A
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v3, v15, s[4:7], 0 offen ; E0501000 8001030F
ds_write_b64 v8, v[1:2] offset:784 ; D89A0310 00000108
s_waitcnt vmcnt(0) ; BF8C0F70
ds_write_b32 v8, v3 offset:792 ; D81A0318 00000308
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[1:4], v10, s[0:3], 0 offen glc ; E07C5000 8000010A
s_waitcnt expcnt(0) ; BF8C0F0F
buffer_load_dword v4, v7, s[4:7], 0 offen ; E0501000 80010407
s_waitcnt vmcnt(0) ; BF8C0F70
ds_write2_b64 v8, v[1:2], v[3:4] offset0:98 offset1:99 ; D89C6362 00030108
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[1:4], v10, s[0:3], 0 offen glc ; E07C5000 8000010A
s_and_saveexec_b64 s[0:1], vcc ; BE80206A
s_cbranch_execz BB0_4 ; BF880000
s_lshl_b64 s[0:1], s[10:11], 16 ; 8E80900A
v_cmp_eq_u32_e32 vcc, 0, v0 ; 7D940080
s_and_saveexec_b64 s[4:5], vcc ; BE84206A
s_waitcnt expcnt(0) ; BF8C0F0F
v_bfrev_b32_e32 v1, 1 ; 7E025881
v_mov_b32_e32 v2, s15 ; 7E04020F
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dword v1, v2, s[0:3], 0 offen glc ; E0705000 80000102
s_or_b64 exec, exec, s[4:5] ; 87FE047E
s_waitcnt expcnt(0) ; BF8C0F0F
v_mov_b32_e32 v1, 1.0 ; 7E0202F2
v_mov_b32_e32 v2, v1 ; 7E040301
v_mov_b32_e32 v3, v1 ; 7E060301
v_mov_b32_e32 v4, v1 ; 7E080301
v_mad_u32_u24 v0, v0, 24, s15 ; D1C30000 003D3100
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 glc ; E07C5004 80000100
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:20 glc ; E0745014 80000100
s_endpgm ; BF810000
*** SHADER STATS ***
SGPRS: 96
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 660 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************
More information about the llvm-commits mailing list