[llvm] r286171 - [AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies
Michel Dänzer via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 8 22:47:34 PST 2016
Hi Stanislav,
On 08/11/16 08:04 AM, Stanislav Mekhanoshin via llvm-commits wrote:
> Author: rampitec
> Date: Mon Nov 7 17:04:50 2016
> New Revision: 286171
>
> URL: http://llvm.org/viewvc/llvm-project?rev=286171&view=rev
> Log:
> [AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies
This change broke the piglit test
spec@glsl-1.30@execution@fs-discard-exit-2 for me with the radeonsi
driver on Kaveri. I'm attaching shader dumps for the good and bad cases.
--
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
-------------- next part --------------
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], COLOR, COLOR
DCL OUT[0], COLOR
0: MOV OUT[0], IN[0]
1: END
radeonsi: Compiling shader 1
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32, float, float, float, float) #0 {
main_body:
%27 = bitcast float %5 to i32
%28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
%29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
%30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
%31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
%32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
%33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}
attributes #0 = { "InitialPSInputAddr"="36983" }
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL SV[0], POSITION
DCL OUT[0], COLOR
DCL CONST[1..2]
DCL CONST[0]
DCL TEMP[0]
DCL TEMP[1..6], LOCAL
IMM[0] FLT32 { 1.0000, 0.0000, 0.1000, 0.0000}
IMM[1] INT32 {0, 10, 0, 0}
0: MOV TEMP[0], SV[0]
1: MAD TEMP[0].y, SV[0], CONST[2].xxxx, CONST[2].yyyy
2: F2I TEMP[1], TEMP[0]
3: MOV TEMP[2].xy, TEMP[1].xyxx
4: INEG TEMP[2].x, CONST[1].xxxx
5: UADD TEMP[2].x, TEMP[1].xxxx, TEMP[2].xxxx
6: IABS TEMP[2].x, TEMP[2].xxxx
7: INEG TEMP[3].x, CONST[1].yyyy
8: UADD TEMP[3].x, TEMP[1].yyyy, TEMP[3].xxxx
9: IABS TEMP[3].x, TEMP[3].xxxx
10: UADD TEMP[2].x, TEMP[2].xxxx, TEMP[3].xxxx
11: USEQ TEMP[1], TEMP[1].xyyy, CONST[0].xyyy
12: AND TEMP[1].x, TEMP[1].xxxx, TEMP[1].yyyy
13: AND TEMP[3], TEMP[1].xxxx, IMM[0].xxxx
14: KILL_IF -TEMP[3].xxxx
15: MOV TEMP[3], IMM[0].yyyy
16: MOV TEMP[4].x, IMM[1].xxxx
17: BGNLOOP :0
18: ISGE TEMP[5].x, TEMP[4].xxxx, IMM[1].yyyy
19: UIF TEMP[5].xxxx :0
20: BRK
21: ENDIF
22: ADD TEMP[6].x, TEMP[3].zzzz, IMM[0].zzzz
23: MOV TEMP[3].z, TEMP[6].xxxx
24: UADD TEMP[4].x, TEMP[4].xxxx, TEMP[2].xxxx
25: UIF TEMP[1].xxxx :0
26: BRK
27: ENDIF
28: ENDLOOP :0
29: MOV OUT[0], TEMP[3]
30: END
radeonsi: Compiling shader 4
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
%23 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !invariant.load !0
%25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
%26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
%27 = fmul float %25, %16
%28 = fadd float %27, %26
%29 = fptosi float %15 to i32
%30 = fptosi float %28 to i32
%31 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%32 = load <16 x i8>, <16 x i8> addrspace(2)* %31, align 16, !invariant.load !0
%33 = call float @llvm.SI.load.const(<16 x i8> %32, i32 16)
%34 = bitcast float %33 to i32
%35 = sub i32 %29, %34
%36 = sub i32 0, %35
%37 = icmp sgt i32 %35, %36
%38 = select i1 %37, i32 %35, i32 %36
%39 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, align 16, !invariant.load !0
%41 = call float @llvm.SI.load.const(<16 x i8> %40, i32 20)
%42 = bitcast float %41 to i32
%43 = sub i32 %30, %42
%44 = sub i32 0, %43
%45 = icmp sgt i32 %43, %44
%46 = select i1 %45, i32 %43, i32 %44
%47 = add i32 %38, %46
%48 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%49 = load <16 x i8>, <16 x i8> addrspace(2)* %48, align 16, !invariant.load !0
%50 = call float @llvm.SI.load.const(<16 x i8> %49, i32 0)
%51 = bitcast float %50 to i32
%52 = icmp eq i32 %29, %51
%53 = call float @llvm.SI.load.const(<16 x i8> %49, i32 4)
%54 = bitcast float %53 to i32
%55 = icmp eq i32 %30, %54
%56 = and i1 %52, %55
%57 = select i1 %56, float -1.000000e+00, float 0.000000e+00
call void @llvm.AMDGPU.kill(float %57)
br label %loop17
loop17: ; preds = %endif21, %main_body
%58 = phi i32 [ 0, %main_body ], [ %62, %endif21 ]
%59 = phi i32 [ 0, %main_body ], [ %62, %endif21 ]
%TEMP3.z.0 = phi float [ 0.000000e+00, %main_body ], [ %61, %endif21 ]
%60 = icmp sgt i32 %58, 9
br i1 %60, label %endloop28, label %endif21
endif21: ; preds = %loop17
%61 = fadd float %TEMP3.z.0, 0x3FB99999A0000000
%62 = add i32 %59, %47
br i1 %56, label %endloop28, label %loop17
endloop28: ; preds = %endif21, %loop17
%TEMP3.z.1 = phi float [ %TEMP3.z.0, %loop17 ], [ %61, %endif21 ]
%63 = bitcast float %5 to i32
%64 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %63, 10
%65 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %64, float 0.000000e+00, 11
%66 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %65, float 0.000000e+00, 12
%67 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %66, float %TEMP3.z.1, 13
%68 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %67, float 0.000000e+00, 14
%69 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %68, float %21, 24
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %69
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
; Function Attrs: nounwind
declare void @llvm.AMDGPU.kill(float) #2
attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
!0 = !{}
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
0: MOV OUT[0], IN[0]
1: END
radeonsi: Compiling shader 5
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
%23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
%24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
%25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
%26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
%27 = bitcast float %5 to i32
%28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
%29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
%30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
%31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
%32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
%33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1
attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
radeonsi: Compiling shader 9
Fragment Shader Epilog LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
define amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
%20 = call i32 @llvm.SI.packf16(float %6, float %7)
%21 = bitcast i32 %20 to float
%22 = call i32 @llvm.SI.packf16(float %8, float %9)
%23 = bitcast i32 %22 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %21, float %23, float undef, float undef)
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2
attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x4
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_mov_b32 m0, s11 ; BEFC030B
v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
v_interp_mov_f32 v2, P0, 2, 0, [m0] ; C80A0202
v_interp_mov_f32 v3, P0, 3, 0, [m0] ; C80E0302
Shader epilog disassembly:
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA = 0x0020
*** SHADER STATS ***
SGPRS: 16
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x4
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; C0800300
v_cvt_i32_f32_e32 v0, v12 ; 7E00110C
s_waitcnt lgkmcnt(0) ; BF8C007F
s_buffer_load_dword s4, s[0:3], 0x9 ; C2020109
s_buffer_load_dword s5, s[0:3], 0x0 ; C2028100
s_buffer_load_dword s6, s[0:3], 0x1 ; C2030101
s_buffer_load_dword s7, s[0:3], 0x4 ; C2038104
s_buffer_load_dword s8, s[0:3], 0x5 ; C2040105
s_buffer_load_dword s2, s[0:3], 0x8 ; C2010108
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v1, s4 ; 7E020204
v_cmp_eq_u32_e64 s[0:1], s5, v0 ; D1840000 00020005
v_subrev_i32_e32 v0, vcc, s7, v0 ; 4E000007
v_sub_i32_e32 v2, vcc, 0, v0 ; 4C040080
v_mac_f32_e32 v1, s2, v13 ; 3E021A02
v_cvt_i32_f32_e32 v1, v1 ; 7E021101
v_max_i32_e32 v0, v2, v0 ; 24000102
v_subrev_i32_e32 v2, vcc, s8, v1 ; 4E040208
v_sub_i32_e32 v3, vcc, 0, v2 ; 4C060480
v_max_i32_e32 v2, v3, v2 ; 24040503
v_add_i32_e32 v0, vcc, v2, v0 ; 4A000102
v_cmp_eq_u32_e32 vcc, s6, v1 ; 7D840206
s_and_b64 s[0:1], s[0:1], vcc ; 87806A00
v_mov_b32_e32 v2, 0 ; 7E040280
v_cndmask_b32_e64 v1, 0, -1.0, s[0:1] ; D2000001 0001E680
v_cmpx_le_f32_e32 vcc, 0, v1 ; 7C260280
s_cbranch_execnz BB0_2 ; BF890000
exp 0, 9, 0, 1, 1, v0, v0, v0, v0 ; F8001890 00000000
s_endpgm ; BF810000
v_sub_i32_e32 v1, vcc, 0, v0 ; 4C020080
s_mov_b64 s[2:3], 0 ; BE820480
v_add_i32_e32 v1, vcc, v1, v0 ; 4A020101
v_cmp_gt_i32_e32 vcc, 10, v1 ; 7D08028A
s_and_saveexec_b64 s[4:5], vcc ; BE84246A
s_xor_b64 s[4:5], exec, s[4:5] ; 8984047E
s_or_b64 s[2:3], s[0:1], s[2:3] ; 88820200
v_add_f32_e32 v2, 0x3dcccccd, v2 ; 060404FF 3DCCCCCD
s_or_b64 exec, exec, s[4:5] ; 88FE047E
s_or_b64 s[2:3], s[4:5], s[2:3] ; 88820204
s_andn2_b64 exec, exec, s[2:3] ; 8AFE027E
s_cbranch_execnz BB0_3 ; BF890000
s_or_b64 exec, exec, s[2:3] ; 88FE027E
v_mov_b32_e32 v0, 0 ; 7E000280
v_mov_b32_e32 v1, 0 ; 7E020280
v_mov_b32_e32 v3, 0 ; 7E060280
v_mov_b32_e32 v13, v15 ; 7E1A030F
Shader epilog disassembly:
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd377
SPI_PS_INPUT_ENA = 0x0320
*** SHADER STATS ***
SGPRS: 16
VGPRS: 17
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 220 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
0: TEX OUT[0], IN[0], SAMP[0], 2D
1: END
radeonsi: Compiling shader 11
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
%23 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
%24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !invariant.load !0
%25 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
%26 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %25, i64 0, i64 3, !amdgpu.uniform !0
%27 = load <4 x i32>, <4 x i32> addrspace(2)* %26, align 16, !invariant.load !0
%28 = extractelement <8 x i32> %24, i32 7
%29 = extractelement <4 x i32> %27, i32 0
%30 = and i32 %29, %28
%31 = insertelement <4 x i32> %27, i32 %30, i32 0
%32 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
%33 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
%34 = bitcast float %32 to i32
%35 = bitcast float %33 to i32
%36 = insertelement <2 x i32> undef, i32 %34, i32 0
%37 = insertelement <2 x i32> %36, i32 %35, i32 1
%38 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %37, <8 x i32> %24, <4 x i32> %31, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%39 = extractelement <4 x float> %38, i32 0
%40 = extractelement <4 x float> %38, i32 1
%41 = extractelement <4 x float> %38, i32 2
%42 = extractelement <4 x float> %38, i32 3
%43 = bitcast float %5 to i32
%44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %43, 10
%45 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %44, float %39, 11
%46 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %45, float %40, 12
%47 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %46, float %41, 13
%48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %47, float %42, 14
%49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %48, float %21, 24
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %49
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
!0 = !{}
radeonsi: Compiling shader 12
Fragment Shader Epilog LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
define amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %6, float %7, float %8, float %9)
ret void
}
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1
attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind }
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x9
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_mov_b64 s[6:7], exec ; BE86047E
s_wqm_b64 exec, exec ; BEFE0A7E
s_load_dwordx8 s[12:19], s[4:5], 0x0 ; C0C60500
s_load_dwordx4 s[0:3], s[4:5], 0xc ; C080050C
s_mov_b32 m0, s11 ; BEFC030B
v_interp_p1_f32 v0, v8, 0, 0, [m0] ; C8000008
v_interp_p2_f32 v0, [v0], v9, 0, 0, [m0] ; C8010009
v_interp_p1_f32 v1, v8, 1, 0, [m0] ; C8040108
s_waitcnt lgkmcnt(0) ; BF8C007F
s_and_b32 s0, s0, s19 ; 87001300
v_interp_p2_f32 v1, [v1], v9, 1, 0, [m0] ; C8050109
s_and_b64 exec, exec, s[6:7] ; 87FE067E
image_sample v[0:3], v[0:1], s[12:19], s[0:3] dmask:0xf ; F0800F00 00030000
s_waitcnt vmcnt(0) ; BF8C0F70
Shader epilog disassembly:
exp 15, 0, 0, 1, 1, v0, v1, v2, v3 ; F800180F 03020100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA = 0x0020
*** SHADER STATS ***
SGPRS: 24
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 72 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
-------------- next part --------------
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], COLOR, COLOR
DCL OUT[0], COLOR
0: MOV OUT[0], IN[0]
1: END
radeonsi: Compiling shader 1
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32, float, float, float, float) #0 {
main_body:
%27 = bitcast float %5 to i32
%28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
%29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
%30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
%31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
%32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
%33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}
attributes #0 = { "InitialPSInputAddr"="36983" }
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL SV[0], POSITION
DCL OUT[0], COLOR
DCL CONST[1..2]
DCL CONST[0]
DCL TEMP[0]
DCL TEMP[1..6], LOCAL
IMM[0] FLT32 { 1.0000, 0.0000, 0.1000, 0.0000}
IMM[1] INT32 {0, 10, 0, 0}
0: MOV TEMP[0], SV[0]
1: MAD TEMP[0].y, SV[0], CONST[2].xxxx, CONST[2].yyyy
2: F2I TEMP[1], TEMP[0]
3: MOV TEMP[2].xy, TEMP[1].xyxx
4: INEG TEMP[2].x, CONST[1].xxxx
5: UADD TEMP[2].x, TEMP[1].xxxx, TEMP[2].xxxx
6: IABS TEMP[2].x, TEMP[2].xxxx
7: INEG TEMP[3].x, CONST[1].yyyy
8: UADD TEMP[3].x, TEMP[1].yyyy, TEMP[3].xxxx
9: IABS TEMP[3].x, TEMP[3].xxxx
10: UADD TEMP[2].x, TEMP[2].xxxx, TEMP[3].xxxx
11: USEQ TEMP[1], TEMP[1].xyyy, CONST[0].xyyy
12: AND TEMP[1].x, TEMP[1].xxxx, TEMP[1].yyyy
13: AND TEMP[3], TEMP[1].xxxx, IMM[0].xxxx
14: KILL_IF -TEMP[3].xxxx
15: MOV TEMP[3], IMM[0].yyyy
16: MOV TEMP[4].x, IMM[1].xxxx
17: BGNLOOP :0
18: ISGE TEMP[5].x, TEMP[4].xxxx, IMM[1].yyyy
19: UIF TEMP[5].xxxx :0
20: BRK
21: ENDIF
22: ADD TEMP[6].x, TEMP[3].zzzz, IMM[0].zzzz
23: MOV TEMP[3].z, TEMP[6].xxxx
24: UADD TEMP[4].x, TEMP[4].xxxx, TEMP[2].xxxx
25: UIF TEMP[1].xxxx :0
26: BRK
27: ENDIF
28: ENDLOOP :0
29: MOV OUT[0], TEMP[3]
30: END
radeonsi: Compiling shader 4
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
%23 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !invariant.load !0
%25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
%26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
%27 = fmul float %25, %16
%28 = fadd float %27, %26
%29 = fptosi float %15 to i32
%30 = fptosi float %28 to i32
%31 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%32 = load <16 x i8>, <16 x i8> addrspace(2)* %31, align 16, !invariant.load !0
%33 = call float @llvm.SI.load.const(<16 x i8> %32, i32 16)
%34 = bitcast float %33 to i32
%35 = sub i32 %29, %34
%36 = sub i32 0, %35
%37 = icmp sgt i32 %35, %36
%38 = select i1 %37, i32 %35, i32 %36
%39 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, align 16, !invariant.load !0
%41 = call float @llvm.SI.load.const(<16 x i8> %40, i32 20)
%42 = bitcast float %41 to i32
%43 = sub i32 %30, %42
%44 = sub i32 0, %43
%45 = icmp sgt i32 %43, %44
%46 = select i1 %45, i32 %43, i32 %44
%47 = add i32 %38, %46
%48 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%49 = load <16 x i8>, <16 x i8> addrspace(2)* %48, align 16, !invariant.load !0
%50 = call float @llvm.SI.load.const(<16 x i8> %49, i32 0)
%51 = bitcast float %50 to i32
%52 = icmp eq i32 %29, %51
%53 = call float @llvm.SI.load.const(<16 x i8> %49, i32 4)
%54 = bitcast float %53 to i32
%55 = icmp eq i32 %30, %54
%56 = and i1 %52, %55
%57 = select i1 %56, float -1.000000e+00, float 0.000000e+00
call void @llvm.AMDGPU.kill(float %57)
br label %loop17
loop17: ; preds = %endif21, %main_body
%58 = phi i32 [ 0, %main_body ], [ %62, %endif21 ]
%59 = phi i32 [ 0, %main_body ], [ %62, %endif21 ]
%TEMP3.z.0 = phi float [ 0.000000e+00, %main_body ], [ %61, %endif21 ]
%60 = icmp sgt i32 %58, 9
br i1 %60, label %endloop28, label %endif21
endif21: ; preds = %loop17
%61 = fadd float %TEMP3.z.0, 0x3FB99999A0000000
%62 = add i32 %59, %47
br i1 %56, label %endloop28, label %loop17
endloop28: ; preds = %endif21, %loop17
%TEMP3.z.1 = phi float [ %TEMP3.z.0, %loop17 ], [ %61, %endif21 ]
%63 = bitcast float %5 to i32
%64 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %63, 10
%65 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %64, float 0.000000e+00, 11
%66 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %65, float 0.000000e+00, 12
%67 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %66, float %TEMP3.z.1, 13
%68 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %67, float 0.000000e+00, 14
%69 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %68, float %21, 24
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %69
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
; Function Attrs: nounwind
declare void @llvm.AMDGPU.kill(float) #2
attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
!0 = !{}
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
0: MOV OUT[0], IN[0]
1: END
radeonsi: Compiling shader 5
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Main part of a pixel shader; it appears to be the compilation of the TGSI
; "MOV OUT[0], IN[0]" program (IN[0] declared GENERIC[0], CONSTANT) printed just before it.
; All 23 parameters are unnamed (%0..%22), so the first instruction result is %23.
; Parameters referenced in the body: %5 = the `float inreg` parameter,
; %6 = the `i32 inreg` parameter, %21 = the last `float` parameter.
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
; Four llvm.SI.fs.constant calls that differ only in their first operand (0..3);
; presumably one per channel of IN[0] -- TODO confirm against the intrinsic definition.
%23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
%24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
%25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
%26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
; %5 is reinterpreted as i32 and stored in field 10 of the returned packed struct.
%27 = bitcast float %5 to i32
%28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
; The four fetched floats are stored in fields 11..14, in call order.
%29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
%30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
%31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
%32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
; %21 is passed through unchanged in field 24; every other field stays undef.
%33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1
attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
radeonsi: Compiling shader 9
Fragment Shader Epilog LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Fragment shader epilog that packs four float inputs into two 32-bit words and exports them.
; The 20 parameters are unnamed (%0..%19): five `i64 inreg`, one `float inreg` (%5),
; then fourteen plain floats (%6..%19). Only %6..%9 are read.
define amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
; Pack (%6, %7) and (%8, %9) pairwise; each i32 result is bitcast to float only so it
; fits the float operands of llvm.SI.export.
%20 = call i32 @llvm.SI.packf16(float %6, float %7)
%21 = bitcast i32 %20 to float
%22 = call i32 @llvm.SI.packf16(float %8, float %9)
%23 = bitcast i32 %22 to float
; Only the first two data operands carry values; the last two are undef.
; NOTE(review): the meaning of the five leading i32 operands is not shown in this dump --
; confirm against the llvm.SI.export intrinsic definition.
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %21, float %23, float undef, float undef)
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2
attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x4
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
; Main-part disassembly: m0 is set from s11, then four v_interp_mov_f32 instructions
; write v0..v3, differing only in the first numeric operand (0..3).
; NOTE(review): presumably the lowering of the four llvm.SI.fs.constant calls in the
; "Compiling shader 5" IR -- confirm. This part has no s_endpgm of its own.
s_mov_b32 m0, s11 ; BEFC030B
v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
v_interp_mov_f32 v2, P0, 2, 0, [m0] ; C80A0202
v_interp_mov_f32 v3, P0, 3, 0, [m0] ; C80E0302
Shader epilog disassembly:
; Epilog disassembly: v0..v3 are converted pairwise by v_cvt_pkrtz_f16_f32 into v0 and v1,
; then exported, and the program ends with s_endpgm.
; NOTE(review): presumably the code generated for the ps_epilog IR that calls
; llvm.SI.packf16 twice followed by llvm.SI.export -- confirm.
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA = 0x0020
*** SHADER STATS ***
SGPRS: 16
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x4
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
; Main-part disassembly of a pixel shader variant. The matching LLVM IR is not printed next
; to this listing, so the notes below describe only what the instructions themselves show.
; NOTE(review): the two s_cbranch_execnz instructions name BB0_2 and BB0_3, but no label
; lines appear in this listing and both encodings carry a zero 16-bit offset (BF890000),
; so the real branch targets cannot be read from this dump.
; NOTE(review): whether this listing belongs to the "good" or the "bad" attachment cannot
; be determined from this part of the dump.
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; C0800300
v_cvt_i32_f32_e32 v0, v12 ; 7E00110C
s_waitcnt lgkmcnt(0) ; BF8C007F
; Six scalar buffer loads through s[0:3] (loaded above) at offsets 0x9, 0x0, 0x1, 0x4,
; 0x5 and 0x8. The last one overwrites s2, which is part of s[0:3] itself.
s_buffer_load_dword s4, s[0:3], 0x9 ; C2020109
s_buffer_load_dword s5, s[0:3], 0x0 ; C2028100
s_buffer_load_dword s6, s[0:3], 0x1 ; C2030101
s_buffer_load_dword s7, s[0:3], 0x4 ; C2038104
s_buffer_load_dword s8, s[0:3], 0x5 ; C2040105
s_buffer_load_dword s2, s[0:3], 0x8 ; C2010108
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v1, s4 ; 7E020204
v_cmp_eq_u32_e64 s[0:1], s5, v0 ; D1840000 00020005
; Each "subtract, negate, max" triple below forms max(-x, x), i.e. an integer absolute value.
v_subrev_i32_e32 v0, vcc, s7, v0 ; 4E000007
v_sub_i32_e32 v2, vcc, 0, v0 ; 4C040080
v_mac_f32_e32 v1, s2, v13 ; 3E021A02
v_cvt_i32_f32_e32 v1, v1 ; 7E021101
v_max_i32_e32 v0, v2, v0 ; 24000102
v_subrev_i32_e32 v2, vcc, s8, v1 ; 4E040208
v_sub_i32_e32 v3, vcc, 0, v2 ; 4C060480
v_max_i32_e32 v2, v3, v2 ; 24040503
v_add_i32_e32 v0, vcc, v2, v0 ; 4A000102
; The two equality compares are combined into s[0:1], which then selects both an integer
; mask (v1 = 0 / -1) and a float (v3 = 0 / -1.0).
v_cmp_eq_u32_e32 vcc, s6, v1 ; 7D840206
s_and_b64 s[0:1], s[0:1], vcc ; 87806A00
v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; D2000001 00018280
v_mov_b32_e32 v2, 0 ; 7E040280
v_cndmask_b32_e64 v3, 0, -1.0, s[0:1] ; D2000003 0001E680
; Compare-and-update-exec followed by an "exec non-zero" branch over an exp/s_endpgm pair.
; NOTE(review): presumably the lowering of a kill/discard with an early exit once no lane
; is left enabled -- confirm against the IR of this shader.
v_cmpx_le_f32_e32 vcc, 0, v3 ; 7C260680
s_cbranch_execnz BB0_2 ; BF890000
exp 0, 9, 0, 1, 1, v0, v0, v0, v0 ; F8001890 00000000
s_endpgm ; BF810000
; NOTE(review): from here to the second s_cbranch_execnz looks like a loop -- s[2:3] is
; zeroed, OR-accumulated, and removed from exec by s_andn2_b64 before the branch to BB0_3.
; Without label lines it is not visible which of these instructions sit inside the loop
; body and which in its preheader; check a labelled dump before drawing conclusions.
v_sub_i32_e32 v3, vcc, 0, v0 ; 4C060080
s_mov_b64 s[2:3], 0 ; BE820480
v_add_i32_e32 v3, vcc, v3, v0 ; 4A060103
v_cmp_ne_u32_e64 s[0:1], 0, v1 ; D18A0000 00020280
v_cmp_gt_i32_e32 vcc, 10, v3 ; 7D08068A
s_and_saveexec_b64 s[4:5], vcc ; BE84246A
s_xor_b64 s[4:5], exec, s[4:5] ; 8984047E
s_or_b64 s[2:3], s[0:1], s[2:3] ; 88820200
; 0x3dcccccd is the single-precision bit pattern closest to 0.1.
v_add_f32_e32 v2, 0x3dcccccd, v2 ; 060404FF 3DCCCCCD
s_or_b64 exec, exec, s[4:5] ; 88FE047E
s_or_b64 s[2:3], s[4:5], s[2:3] ; 88820204
s_andn2_b64 exec, exec, s[2:3] ; 8AFE027E
s_cbranch_execnz BB0_3 ; BF890000
; The lanes accumulated in s[2:3] are re-enabled, then v0, v1 and v3 are cleared and v15 is
; copied to v13. v2 keeps the value accumulated by the v_add_f32 above.
; This part does not end with s_endpgm.
s_or_b64 exec, exec, s[2:3] ; 88FE027E
v_mov_b32_e32 v0, 0 ; 7E000280
v_mov_b32_e32 v1, 0 ; 7E020280
v_mov_b32_e32 v3, 0 ; 7E060280
v_mov_b32_e32 v13, v15 ; 7E1A030F
Shader epilog disassembly:
; Epilog disassembly: v0..v3 are converted pairwise by v_cvt_pkrtz_f16_f32 into v0 and v1,
; then exported, and the program ends with s_endpgm.
; NOTE(review): presumably the code generated for the ps_epilog IR that calls
; llvm.SI.packf16 twice followed by llvm.SI.export -- confirm.
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd377
SPI_PS_INPUT_ENA = 0x0320
*** SHADER STATS ***
SGPRS: 16
VGPRS: 17
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 236 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
0: TEX OUT[0], IN[0], SAMP[0], 2D
1: END
radeonsi: Compiling shader 11
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Main part of a pixel shader; it appears to be the compilation of the TGSI
; "TEX OUT[0], IN[0], SAMP[0], 2D" program printed just before it.
; All 23 parameters are unnamed (%0..%22), so the first instruction result is %23.
; Parameters referenced in the body: %2 = the [32 x <8 x i32>] pointer, %5 = `float inreg`,
; %6 = `i32 inreg`, %12 = the sixth vector parameter (<2 x i32>), %21 = the last `float`.
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
; Load element 0 of the array at %2 as an <8 x i32>; it is later the second operand of the
; sample call.
%23 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
%24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !invariant.load !0
; Reinterpret the same base pointer as an array of <4 x i32> and load index 3, i.e. the
; 16 bytes at byte offset 48 from %2; it becomes the third operand of the sample call.
%25 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
%26 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %25, i64 0, i64 3, !amdgpu.uniform !0
%27 = load <4 x i32>, <4 x i32> addrspace(2)* %26, align 16, !invariant.load !0
; Element 0 of the <4 x i32> is ANDed with element 7 of the <8 x i32> and written back.
; NOTE(review): the purpose of this masking is not visible in this dump -- see the driver.
%28 = extractelement <8 x i32> %24, i32 7
%29 = extractelement <4 x i32> %27, i32 0
%30 = and i32 %29, %28
%31 = insertelement <4 x i32> %27, i32 %30, i32 0
; Two llvm.SI.fs.interp calls differing only in the first operand (0 and 1); their results
; are bitcast to i32 and packed into the <2 x i32> address operand of the sample.
%32 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
%33 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
%34 = bitcast float %32 to i32
%35 = bitcast float %33 to i32
%36 = insertelement <2 x i32> undef, i32 %34, i32 0
%37 = insertelement <2 x i32> %36, i32 %35, i32 1
%38 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %37, <8 x i32> %24, <4 x i32> %31, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
; Split the sampled <4 x float> into scalars for the return struct.
%39 = extractelement <4 x float> %38, i32 0
%40 = extractelement <4 x float> %38, i32 1
%41 = extractelement <4 x float> %38, i32 2
%42 = extractelement <4 x float> %38, i32 3
; Return struct layout used here: field 10 = %5 reinterpreted as i32, fields 11..14 = the
; four sampled components, field 24 = %21 passed through; every other field stays undef.
%43 = bitcast float %5 to i32
%44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %43, 10
%45 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %44, float %39, 11
%46 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %45, float %40, 12
%47 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %46, float %41, 13
%48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %47, float %42, 14
%49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %48, float %21, 24
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %49
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
!0 = !{}
radeonsi: Compiling shader 12
Fragment Shader Epilog LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Fragment shader epilog that exports four float inputs directly, with no packing step.
; The 20 parameters are unnamed (%0..%19): five `i64 inreg`, one `float inreg` (%5),
; then fourteen plain floats (%6..%19). Only %6..%9 are read.
define amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
; NOTE(review): the meaning of the five leading i32 operands is not shown in this dump --
; confirm against the llvm.SI.export intrinsic definition.
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %6, float %7, float %8, float %9)
ret void
}
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1
attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind }
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x9
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
; Main-part disassembly; the instruction mix (two interpolations, one AND, one image_sample)
; matches the "Compiling shader 11" IR -- presumably its generated code, confirm.
; exec is saved in s[6:7] and replaced by its s_wqm_b64 result for the interpolation code.
s_mov_b64 s[6:7], exec ; BE86047E
s_wqm_b64 exec, exec ; BEFE0A7E
; Eight dwords at offset 0x0 and four dwords at offset 0xc are loaded from the s[4:5] base.
s_load_dwordx8 s[12:19], s[4:5], 0x0 ; C0C60500
s_load_dwordx4 s[0:3], s[4:5], 0xc ; C080050C
s_mov_b32 m0, s11 ; BEFC030B
; Two p1/p2 interpolation pairs produce v0 and v1, the address operands of image_sample.
v_interp_p1_f32 v0, v8, 0, 0, [m0] ; C8000008
v_interp_p2_f32 v0, [v0], v9, 0, 0, [m0] ; C8010009
v_interp_p1_f32 v1, v8, 1, 0, [m0] ; C8040108
s_waitcnt lgkmcnt(0) ; BF8C007F
; First dword of s[0:3] is ANDed with the last dword of s[12:19].
s_and_b32 s0, s0, s19 ; 87001300
v_interp_p2_f32 v1, [v1], v9, 1, 0, [m0] ; C8050109
; exec is narrowed back to the lanes saved in s[6:7] before sampling.
s_and_b64 exec, exec, s[6:7] ; 87FE067E
image_sample v[0:3], v[0:1], s[12:19], s[0:3] dmask:0xf ; F0800F00 00030000
; Wait for the sample result; this part has no s_endpgm of its own.
s_waitcnt vmcnt(0) ; BF8C0F70
Shader epilog disassembly:
; Epilog disassembly: a single exp of v0..v3 with no conversion step, then s_endpgm.
; NOTE(review): presumably the code generated for the ps_epilog IR whose only call is
; llvm.SI.export with %6..%9 -- confirm.
exp 15, 0, 0, 1, 1, v0, v1, v2, v3 ; F800180F 03020100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA = 0x0020
*** SHADER STATS ***
SGPRS: 24
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 72 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
More information about the llvm-commits
mailing list