[llvm] r286171 - [AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies
Michel Dänzer via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 9 22:56:10 PST 2016
On 10/11/16 03:36 PM, Mekhanoshin, Stanislav wrote:
> I would really appreciate it if someone familiar with GLSL could explain the problem. Here is the only difference in the code produced:
>
> 303d302
> < v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; D2000001 00018280
> 305,306c304,305
> < v_cndmask_b32_e64 v3, 0, -1.0, s[0:1] ; D2000003 0001E680
> < v_cmpx_le_f32_e32 vcc, 0, v3 ; 7C260680
> ---
>> v_cndmask_b32_e64 v1, 0, -1.0, s[0:1] ; D2000001 0001E680
>> v_cmpx_le_f32_e32 vcc, 0, v1 ; 7C260280
> 310c309
> < v_sub_i32_e32 v3, vcc, 0, v0 ; 4C060080
> ---
>> v_sub_i32_e32 v1, vcc, 0, v0 ; 4C020080
> 312,314c311,312
> < v_add_i32_e32 v3, vcc, v3, v0 ; 4A060103
> < v_cmp_ne_u32_e64 s[0:1], 0, v1 ; D18A0000 00020280
> < v_cmp_gt_i32_e32 vcc, 10, v3 ; 7D08068A
> ---
>> v_add_i32_e32 v1, vcc, v1, v0 ; 4A020101
>> v_cmp_gt_i32_e32 vcc, 10, v1 ; 7D08028A
>
> So, what happens is this instruction is removed:
>
> v_cndmask_b32_e64 v1, 0, -1, s[0:1]
>
> it copies s[0:1] into v1 for the lane.
>
> Then the instruction which restores s[0:1] from v1 is also removed:
>
> v_cmp_ne_u32_e64 s[0:1], 0, v1
>
> Neither s0 nor s1 is written in between or anywhere after this point. Since v1 is now free, the other modified instructions use v1 instead of v3, which again does not seem to be an issue to me.
> The only difference I can see is the contents of v1 and v3 upon kernel termination if discard is called... Is there anything in the GLSL ABI which requires v1 and v3 to hold specific values on exit? My question comes from this epilogue on the non-discard return:
>
> v_mov_b32_e32 v0, 0
> v_mov_b32_e32 v1, 0
> v_mov_b32_e32 v3, 0
> v_mov_b32_e32 v13, v15
> ; return
>
> Also, this piece of code does not end with s_endpgm. I also do not see the branch target BB0_2 used in the generated code, and I generally suspect this is just a part of a bigger kernel (based on the absence of s_endpgm at the end). I.e. there could potentially be a problem if this code is just inserted somewhere in a bigger context that is not visible to the compiler.
The dumps were from radeonsi's default mode using separate shader
prologues and epilogues. Attached to this e-mail are the corresponding
dumps using monolithic shaders. As you can see, v1 contributes to the
colour value exported from the pixel shader.
--
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
-------------- next part --------------
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
0: MOV OUT[0], IN[0]
1: END
radeonsi: Compiling shader 2
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #0
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #0
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1
define amdgpu_ps void @wrapper([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) {
main_body:
%23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
%24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
%25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
%26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
%27 = call i32 @llvm.SI.packf16(float %23, float %24)
%28 = bitcast i32 %27 to float
%29 = call i32 @llvm.SI.packf16(float %25, float %26)
%30 = bitcast i32 %29 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %28, float %30, float undef, float undef)
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x4
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_mov_b32 m0, s11 ; BEFC030B
v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
v_interp_mov_f32 v2, P0, 2, 0, [m0] ; C80A0202
v_interp_mov_f32 v3, P0, 3, 0, [m0] ; C80E0302
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0001
SPI_PS_INPUT_ENA = 0x0001
*** SHADER STATS ***
SGPRS: 16
VGPRS: 4
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL SV[0], POSITION
DCL OUT[0], COLOR
DCL CONST[1..2]
DCL CONST[0]
DCL TEMP[0]
DCL TEMP[1..6], LOCAL
IMM[0] FLT32 { 1.0000, 0.0000, 0.1000, 0.0000}
IMM[1] INT32 {0, 10, 0, 0}
0: MOV TEMP[0], SV[0]
1: MAD TEMP[0].y, SV[0], CONST[2].xxxx, CONST[2].yyyy
2: F2I TEMP[1], TEMP[0]
3: MOV TEMP[2].xy, TEMP[1].xyxx
4: INEG TEMP[2].x, CONST[1].xxxx
5: UADD TEMP[2].x, TEMP[1].xxxx, TEMP[2].xxxx
6: IABS TEMP[2].x, TEMP[2].xxxx
7: INEG TEMP[3].x, CONST[1].yyyy
8: UADD TEMP[3].x, TEMP[1].yyyy, TEMP[3].xxxx
9: IABS TEMP[3].x, TEMP[3].xxxx
10: UADD TEMP[2].x, TEMP[2].xxxx, TEMP[3].xxxx
11: USEQ TEMP[1], TEMP[1].xyyy, CONST[0].xyyy
12: AND TEMP[1].x, TEMP[1].xxxx, TEMP[1].yyyy
13: AND TEMP[3], TEMP[1].xxxx, IMM[0].xxxx
14: KILL_IF -TEMP[3].xxxx
15: MOV TEMP[3], IMM[0].yyyy
16: MOV TEMP[4].x, IMM[1].xxxx
17: BGNLOOP :0
18: ISGE TEMP[5].x, TEMP[4].xxxx, IMM[1].yyyy
19: UIF TEMP[5].xxxx :0
20: BRK
21: ENDIF
22: ADD TEMP[6].x, TEMP[3].zzzz, IMM[0].zzzz
23: MOV TEMP[3].z, TEMP[6].xxxx
24: UADD TEMP[4].x, TEMP[4].xxxx, TEMP[2].xxxx
25: UIF TEMP[1].xxxx :0
26: BRK
27: ENDIF
28: ENDLOOP :0
29: MOV OUT[0], TEMP[3]
30: END
radeonsi: Compiling shader 4
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #0
; Function Attrs: nounwind
declare void @llvm.AMDGPU.kill(float) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #0
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1
define amdgpu_ps void @wrapper([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) {
main_body:
%23 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !invariant.load !0
%25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
%26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
%27 = fmul float %25, %16
%28 = fadd float %27, %26
%29 = fptosi float %15 to i32
%30 = fptosi float %28 to i32
%31 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%32 = load <16 x i8>, <16 x i8> addrspace(2)* %31, align 16, !invariant.load !0
%33 = call float @llvm.SI.load.const(<16 x i8> %32, i32 16)
%34 = bitcast float %33 to i32
%35 = sub i32 %29, %34
%36 = sub i32 0, %35
%37 = icmp sgt i32 %35, %36
%38 = select i1 %37, i32 %35, i32 %36
%39 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, align 16, !invariant.load !0
%41 = call float @llvm.SI.load.const(<16 x i8> %40, i32 20)
%42 = bitcast float %41 to i32
%43 = sub i32 %30, %42
%44 = sub i32 0, %43
%45 = icmp sgt i32 %43, %44
%46 = select i1 %45, i32 %43, i32 %44
%47 = add i32 %38, %46
%48 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%49 = load <16 x i8>, <16 x i8> addrspace(2)* %48, align 16, !invariant.load !0
%50 = call float @llvm.SI.load.const(<16 x i8> %49, i32 0)
%51 = bitcast float %50 to i32
%52 = icmp eq i32 %29, %51
%53 = call float @llvm.SI.load.const(<16 x i8> %49, i32 4)
%54 = bitcast float %53 to i32
%55 = icmp eq i32 %30, %54
%56 = and i1 %52, %55
%57 = select i1 %56, float -1.000000e+00, float 0.000000e+00
call void @llvm.AMDGPU.kill(float %57)
br label %loop17.i
loop17.i: ; preds = %endif21.i, %main_body
%58 = phi i32 [ 0, %main_body ], [ %61, %endif21.i ]
%TEMP3.z.0.i = phi float [ 0.000000e+00, %main_body ], [ %60, %endif21.i ]
%59 = icmp sgt i32 %58, 9
br i1 %59, label %main.exit, label %endif21.i
endif21.i: ; preds = %loop17.i
%60 = fadd float %TEMP3.z.0.i, 0x3FB99999A0000000
%61 = add i32 %58, %47
br i1 %56, label %main.exit, label %loop17.i
main.exit: ; preds = %loop17.i, %endif21.i
%TEMP3.z.1.i = phi float [ %TEMP3.z.0.i, %loop17.i ], [ %60, %endif21.i ]
%62 = call i32 @llvm.SI.packf16(float 0.000000e+00, float 0.000000e+00)
%63 = bitcast i32 %62 to float
%64 = call i32 @llvm.SI.packf16(float %TEMP3.z.1.i, float 0.000000e+00)
%65 = bitcast i32 %64 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %63, float %65, float undef, float undef)
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
!0 = !{}
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x4
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; C0800300
v_cvt_i32_f32_e32 v0, v2 ; 7E001102
s_waitcnt lgkmcnt(0) ; BF8C007F
s_buffer_load_dword s4, s[0:3], 0x0 ; C2020100
s_buffer_load_dword s5, s[0:3], 0x1 ; C2028101
s_buffer_load_dword s6, s[0:3], 0x4 ; C2030104
s_buffer_load_dword s7, s[0:3], 0x5 ; C2038105
s_buffer_load_dword s8, s[0:3], 0x8 ; C2040108
s_buffer_load_dword s0, s[0:3], 0x9 ; C2000109
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v1, s0 ; 7E020200
v_mac_f32_e32 v1, s8, v3 ; 3E020608
v_cvt_i32_f32_e32 v2, v1 ; 7E041101
v_subrev_i32_e32 v1, vcc, s6, v0 ; 4E020006
v_sub_i32_e32 v3, vcc, 0, v1 ; 4C060280
v_max_i32_e32 v1, v3, v1 ; 24020303
v_subrev_i32_e32 v3, vcc, s7, v2 ; 4E060407
v_sub_i32_e32 v4, vcc, 0, v3 ; 4C080680
v_max_i32_e32 v3, v4, v3 ; 24060704
v_add_i32_e32 v1, vcc, v3, v1 ; 4A020303
v_cmp_eq_u32_e32 vcc, s4, v0 ; 7D840004
v_cmp_eq_u32_e64 s[0:1], s5, v2 ; D1840000 00020405
s_and_b64 s[0:1], vcc, s[0:1] ; 8780006A
v_mov_b32_e32 v0, 0 ; 7E000280
v_cndmask_b32_e64 v2, 0, -1.0, s[0:1] ; D2000002 0001E680
v_cmpx_le_f32_e32 vcc, 0, v2 ; 7C260480
s_cbranch_execnz BB0_2 ; BF890000
exp 0, 9, 0, 1, 1, v0, v0, v0, v0 ; F8001890 00000000
s_endpgm ; BF810000
v_sub_i32_e32 v2, vcc, 0, v1 ; 4C040280
s_mov_b64 s[2:3], 0 ; BE820480
v_add_i32_e32 v2, vcc, v2, v1 ; 4A040302
v_cmp_gt_i32_e32 vcc, 10, v2 ; 7D08048A
s_and_saveexec_b64 s[4:5], vcc ; BE84246A
s_xor_b64 s[4:5], exec, s[4:5] ; 8984047E
s_or_b64 s[2:3], s[0:1], s[2:3] ; 88820200
v_add_f32_e32 v0, 0x3dcccccd, v0 ; 060000FF 3DCCCCCD
s_or_b64 exec, exec, s[4:5] ; 88FE047E
s_or_b64 s[2:3], s[4:5], s[2:3] ; 88820204
s_andn2_b64 exec, exec, s[2:3] ; 8AFE027E
s_cbranch_execnz BB0_3 ; BF890000
s_or_b64 exec, exec, s[2:3] ; 88FE027E
v_cvt_pkrtz_f16_f32_e64 v1, 0, 0 ; D25E0001 00010080
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; D25E0000 00010100
exp 15, 0, 1, 1, 1, v1, v0, v0, v0 ; F8001C0F 00000001
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0301
SPI_PS_INPUT_ENA = 0x0301
*** SHADER STATS ***
SGPRS: 16
VGPRS: 8
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 212 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
0: TEX OUT[0], IN[0], SAMP[0], 2D
1: END
radeonsi: Compiling shader 5
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1
define amdgpu_ps void @wrapper([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) {
main_body:
%23 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
%24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !invariant.load !0
%25 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
%26 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %25, i64 0, i64 3, !amdgpu.uniform !0
%27 = load <4 x i32>, <4 x i32> addrspace(2)* %26, align 16, !invariant.load !0
%28 = extractelement <8 x i32> %24, i32 7
%29 = extractelement <4 x i32> %27, i32 0
%30 = and i32 %29, %28
%31 = insertelement <4 x i32> %27, i32 %30, i32 0
%32 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
%33 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
%34 = bitcast float %32 to i32
%35 = bitcast float %33 to i32
%36 = insertelement <2 x i32> undef, i32 %34, i32 0
%37 = insertelement <2 x i32> %36, i32 %35, i32 1
%38 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %37, <8 x i32> %24, <4 x i32> %31, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%39 = extractelement <4 x float> %38, i32 0
%40 = extractelement <4 x float> %38, i32 1
%41 = extractelement <4 x float> %38, i32 2
%42 = extractelement <4 x float> %38, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %39, float %40, float %41, float %42)
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
!0 = !{}
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x9
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_wqm_b64 exec, exec ; BEFE0A7E
s_load_dwordx8 s[12:19], s[4:5], 0x0 ; C0C60500
s_load_dwordx4 s[0:3], s[4:5], 0xc ; C080050C
s_mov_b32 m0, s11 ; BEFC030B
v_interp_p1_f32 v2, v0, 0, 0, [m0] ; C8080000
v_interp_p2_f32 v2, [v2], v1, 0, 0, [m0] ; C8090001
v_interp_p1_f32 v3, v0, 1, 0, [m0] ; C80C0100
s_waitcnt lgkmcnt(0) ; BF8C007F
s_and_b32 s0, s0, s19 ; 87001300
v_interp_p2_f32 v3, [v3], v1, 1, 0, [m0] ; C80D0101
image_sample v[0:3], v[2:3], s[12:19], s[0:3] dmask:0xf ; F0800F00 00030002
s_waitcnt vmcnt(0) ; BF8C0F70
exp 15, 0, 0, 1, 1, v0, v1, v2, v3 ; F800180F 03020100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0020
SPI_PS_INPUT_ENA = 0x0020
*** SHADER STATS ***
SGPRS: 24
VGPRS: 4
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 64 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
-------------- next part --------------
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
0: MOV OUT[0], IN[0]
1: END
radeonsi: Compiling shader 2
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #0
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #0
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1
define amdgpu_ps void @wrapper([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) {
main_body:
%23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
%24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
%25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
%26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
%27 = call i32 @llvm.SI.packf16(float %23, float %24)
%28 = bitcast i32 %27 to float
%29 = call i32 @llvm.SI.packf16(float %25, float %26)
%30 = bitcast i32 %29 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %28, float %30, float undef, float undef)
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x4
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_mov_b32 m0, s11 ; BEFC030B
v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
v_interp_mov_f32 v2, P0, 2, 0, [m0] ; C80A0202
v_interp_mov_f32 v3, P0, 3, 0, [m0] ; C80E0302
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0001
SPI_PS_INPUT_ENA = 0x0001
*** SHADER STATS ***
SGPRS: 16
VGPRS: 4
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL SV[0], POSITION
DCL OUT[0], COLOR
DCL CONST[1..2]
DCL CONST[0]
DCL TEMP[0]
DCL TEMP[1..6], LOCAL
IMM[0] FLT32 { 1.0000, 0.0000, 0.1000, 0.0000}
IMM[1] INT32 {0, 10, 0, 0}
0: MOV TEMP[0], SV[0]
1: MAD TEMP[0].y, SV[0], CONST[2].xxxx, CONST[2].yyyy
2: F2I TEMP[1], TEMP[0]
3: MOV TEMP[2].xy, TEMP[1].xyxx
4: INEG TEMP[2].x, CONST[1].xxxx
5: UADD TEMP[2].x, TEMP[1].xxxx, TEMP[2].xxxx
6: IABS TEMP[2].x, TEMP[2].xxxx
7: INEG TEMP[3].x, CONST[1].yyyy
8: UADD TEMP[3].x, TEMP[1].yyyy, TEMP[3].xxxx
9: IABS TEMP[3].x, TEMP[3].xxxx
10: UADD TEMP[2].x, TEMP[2].xxxx, TEMP[3].xxxx
11: USEQ TEMP[1], TEMP[1].xyyy, CONST[0].xyyy
12: AND TEMP[1].x, TEMP[1].xxxx, TEMP[1].yyyy
13: AND TEMP[3], TEMP[1].xxxx, IMM[0].xxxx
14: KILL_IF -TEMP[3].xxxx
15: MOV TEMP[3], IMM[0].yyyy
16: MOV TEMP[4].x, IMM[1].xxxx
17: BGNLOOP :0
18: ISGE TEMP[5].x, TEMP[4].xxxx, IMM[1].yyyy
19: UIF TEMP[5].xxxx :0
20: BRK
21: ENDIF
22: ADD TEMP[6].x, TEMP[3].zzzz, IMM[0].zzzz
23: MOV TEMP[3].z, TEMP[6].xxxx
24: UADD TEMP[4].x, TEMP[4].xxxx, TEMP[2].xxxx
25: UIF TEMP[1].xxxx :0
26: BRK
27: ENDIF
28: ENDLOOP :0
29: MOV OUT[0], TEMP[3]
30: END
radeonsi: Compiling shader 4
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #0
; Function Attrs: nounwind
declare void @llvm.AMDGPU.kill(float) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #0
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1
define amdgpu_ps void @wrapper([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) {
main_body:
%23 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !invariant.load !0
%25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
%26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
%27 = fmul float %25, %16
%28 = fadd float %27, %26
%29 = fptosi float %15 to i32
%30 = fptosi float %28 to i32
%31 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%32 = load <16 x i8>, <16 x i8> addrspace(2)* %31, align 16, !invariant.load !0
%33 = call float @llvm.SI.load.const(<16 x i8> %32, i32 16)
%34 = bitcast float %33 to i32
%35 = sub i32 %29, %34
%36 = sub i32 0, %35
%37 = icmp sgt i32 %35, %36
%38 = select i1 %37, i32 %35, i32 %36
%39 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, align 16, !invariant.load !0
%41 = call float @llvm.SI.load.const(<16 x i8> %40, i32 20)
%42 = bitcast float %41 to i32
%43 = sub i32 %30, %42
%44 = sub i32 0, %43
%45 = icmp sgt i32 %43, %44
%46 = select i1 %45, i32 %43, i32 %44
%47 = add i32 %38, %46
%48 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
%49 = load <16 x i8>, <16 x i8> addrspace(2)* %48, align 16, !invariant.load !0
%50 = call float @llvm.SI.load.const(<16 x i8> %49, i32 0)
%51 = bitcast float %50 to i32
%52 = icmp eq i32 %29, %51
%53 = call float @llvm.SI.load.const(<16 x i8> %49, i32 4)
%54 = bitcast float %53 to i32
%55 = icmp eq i32 %30, %54
%56 = and i1 %52, %55
%57 = select i1 %56, float -1.000000e+00, float 0.000000e+00
call void @llvm.AMDGPU.kill(float %57)
br label %loop17.i
loop17.i: ; preds = %endif21.i, %main_body
%58 = phi i32 [ 0, %main_body ], [ %61, %endif21.i ]
%TEMP3.z.0.i = phi float [ 0.000000e+00, %main_body ], [ %60, %endif21.i ]
%59 = icmp sgt i32 %58, 9
br i1 %59, label %main.exit, label %endif21.i
endif21.i: ; preds = %loop17.i
%60 = fadd float %TEMP3.z.0.i, 0x3FB99999A0000000
%61 = add i32 %58, %47
br i1 %56, label %main.exit, label %loop17.i
main.exit: ; preds = %loop17.i, %endif21.i
%TEMP3.z.1.i = phi float [ %TEMP3.z.0.i, %loop17.i ], [ %60, %endif21.i ]
%62 = call i32 @llvm.SI.packf16(float 0.000000e+00, float 0.000000e+00)
%63 = bitcast i32 %62 to float
%64 = call i32 @llvm.SI.packf16(float %TEMP3.z.1.i, float 0.000000e+00)
%65 = bitcast i32 %64 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %63, float %65, float undef, float undef)
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
!0 = !{}
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x4
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; C0800300
v_cvt_i32_f32_e32 v0, v2 ; 7E001102
s_waitcnt lgkmcnt(0) ; BF8C007F
s_buffer_load_dword s4, s[0:3], 0x0 ; C2020100
s_buffer_load_dword s5, s[0:3], 0x1 ; C2028101
s_buffer_load_dword s6, s[0:3], 0x4 ; C2030104
s_buffer_load_dword s7, s[0:3], 0x5 ; C2038105
s_buffer_load_dword s8, s[0:3], 0x8 ; C2040108
s_buffer_load_dword s0, s[0:3], 0x9 ; C2000109
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v1, s0 ; 7E020200
v_mac_f32_e32 v1, s8, v3 ; 3E020608
v_cvt_i32_f32_e32 v2, v1 ; 7E041101
v_subrev_i32_e32 v1, vcc, s6, v0 ; 4E020006
v_sub_i32_e32 v3, vcc, 0, v1 ; 4C060280
v_max_i32_e32 v1, v3, v1 ; 24020303
v_subrev_i32_e32 v3, vcc, s7, v2 ; 4E060407
v_sub_i32_e32 v4, vcc, 0, v3 ; 4C080680
v_max_i32_e32 v3, v4, v3 ; 24060704
v_add_i32_e32 v1, vcc, v3, v1 ; 4A020303
v_cmp_eq_u32_e32 vcc, s4, v0 ; 7D840004
v_cmp_eq_u32_e64 s[0:1], s5, v2 ; D1840000 00020405
s_and_b64 s[0:1], vcc, s[0:1] ; 8780006A
v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; D2000002 00018280
v_mov_b32_e32 v0, 0 ; 7E000280
v_cndmask_b32_e64 v3, 0, -1.0, s[0:1] ; D2000003 0001E680
v_cmpx_le_f32_e32 vcc, 0, v3 ; 7C260680
s_cbranch_execnz BB0_2 ; BF890000
exp 0, 9, 0, 1, 1, v0, v0, v0, v0 ; F8001890 00000000
s_endpgm ; BF810000
v_sub_i32_e32 v3, vcc, 0, v1 ; 4C060280
s_mov_b64 s[2:3], 0 ; BE820480
v_add_i32_e32 v3, vcc, v3, v1 ; 4A060303
v_cmp_ne_u32_e64 s[0:1], 0, v2 ; D18A0000 00020480
v_cmp_gt_i32_e32 vcc, 10, v3 ; 7D08068A
s_and_saveexec_b64 s[4:5], vcc ; BE84246A
s_xor_b64 s[4:5], exec, s[4:5] ; 8984047E
s_or_b64 s[2:3], s[0:1], s[2:3] ; 88820200
v_add_f32_e32 v0, 0x3dcccccd, v0 ; 060000FF 3DCCCCCD
s_or_b64 exec, exec, s[4:5] ; 88FE047E
s_or_b64 s[2:3], s[4:5], s[2:3] ; 88820204
s_andn2_b64 exec, exec, s[2:3] ; 8AFE027E
s_cbranch_execnz BB0_3 ; BF890000
s_or_b64 exec, exec, s[2:3] ; 88FE027E
v_cvt_pkrtz_f16_f32_e64 v1, 0, 0 ; D25E0001 00010080
v_cvt_pkrtz_f16_f32_e64 v0, v0, 0 ; D25E0000 00010100
exp 15, 0, 1, 1, 1, v1, v0, v0, v0 ; F8001C0F 00000001
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0301
SPI_PS_INPUT_ENA = 0x0301
*** SHADER STATS ***
SGPRS: 16
VGPRS: 8
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 228 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
0: TEX OUT[0], IN[0], SAMP[0], 2D
1: END
radeonsi: Compiling shader 5
TGSI shader LLVM IR:
; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1
define amdgpu_ps void @wrapper([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) {
main_body:
%23 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
%24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !invariant.load !0
%25 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
%26 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %25, i64 0, i64 3, !amdgpu.uniform !0
%27 = load <4 x i32>, <4 x i32> addrspace(2)* %26, align 16, !invariant.load !0
%28 = extractelement <8 x i32> %24, i32 7
%29 = extractelement <4 x i32> %27, i32 0
%30 = and i32 %29, %28
%31 = insertelement <4 x i32> %27, i32 %30, i32 0
%32 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
%33 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
%34 = bitcast float %32 to i32
%35 = bitcast float %33 to i32
%36 = insertelement <2 x i32> undef, i32 %34, i32 0
%37 = insertelement <2 x i32> %36, i32 %35, i32 1
%38 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %37, <8 x i32> %24, <4 x i32> %31, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%39 = extractelement <4 x float> %38, i32 0
%40 = extractelement <4 x float> %38, i32 1
%41 = extractelement <4 x float> %38, i32 2
%42 = extractelement <4 x float> %38, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %39, float %40, float %41, float %42)
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
!0 = !{}
SHADER KEY
prolog.color_two_side = 0
prolog.flatshade_colors = 0
prolog.poly_stipple = 0
prolog.force_persp_sample_interp = 0
prolog.force_linear_sample_interp = 0
prolog.force_persp_center_interp = 0
prolog.force_linear_center_interp = 0
prolog.bc_optimize_for_persp = 0
prolog.bc_optimize_for_linear = 0
epilog.spi_shader_col_format = 0x9
epilog.color_is_int8 = 0x0
epilog.last_cbuf = 0
epilog.alpha_func = 7
epilog.alpha_to_one = 0
epilog.poly_line_smoothing = 0
epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_wqm_b64 exec, exec ; BEFE0A7E
s_load_dwordx8 s[12:19], s[4:5], 0x0 ; C0C60500
s_load_dwordx4 s[0:3], s[4:5], 0xc ; C080050C
s_mov_b32 m0, s11 ; BEFC030B
v_interp_p1_f32 v2, v0, 0, 0, [m0] ; C8080000
v_interp_p2_f32 v2, [v2], v1, 0, 0, [m0] ; C8090001
v_interp_p1_f32 v3, v0, 1, 0, [m0] ; C80C0100
s_waitcnt lgkmcnt(0) ; BF8C007F
s_and_b32 s0, s0, s19 ; 87001300
v_interp_p2_f32 v3, [v3], v1, 1, 0, [m0] ; C80D0101
image_sample v[0:3], v[2:3], s[12:19], s[0:3] dmask:0xf ; F0800F00 00030002
s_waitcnt vmcnt(0) ; BF8C0F70
exp 15, 0, 0, 1, 1, v0, v1, v2, v3 ; F800180F 03020100
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0020
SPI_PS_INPUT_ENA = 0x0020
*** SHADER STATS ***
SGPRS: 24
VGPRS: 4
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 64 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
More information about the llvm-commits
mailing list