[llvm] r286171 - [AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies

Tue Nov 8 22:47:34 PST 2016

Hi Stanislav,

On 08/11/16 08:04 AM, Stanislav Mekhanoshin via llvm-commits wrote:
> Author: rampitec
> Date: Mon Nov  7 17:04:50 2016
> New Revision: 286171
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=286171&view=rev
> Log:
> [AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies

This change broke the piglit test
spec@glsl-1.30@execution@fs-discard-exit-2 for me with the radeonsi
driver on Kaveri. I'm attaching shader dumps for the good and bad cases.

-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
-------------- next part --------------
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], COLOR, COLOR
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 1
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32, float, float, float, float) #0 {
main_body:
  %27 = bitcast float %5 to i32
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}

attributes #0 = { "InitialPSInputAddr"="36983" }

FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL SV[0], POSITION
DCL OUT[0], COLOR
DCL CONST[1..2]
DCL CONST[0]
DCL TEMP[0]
DCL TEMP[1..6], LOCAL
IMM[0] FLT32 {    1.0000,     0.0000,     0.1000,     0.0000}
IMM[1] INT32 {0, 10, 0, 0}
  0: MOV TEMP[0], SV[0]
  1: MAD TEMP[0].y, SV[0], CONST[2].xxxx, CONST[2].yyyy
  2: F2I TEMP[1], TEMP[0]
  3: MOV TEMP[2].xy, TEMP[1].xyxx
  4: INEG TEMP[2].x, CONST[1].xxxx
  5: UADD TEMP[2].x, TEMP[1].xxxx, TEMP[2].xxxx
  6: IABS TEMP[2].x, TEMP[2].xxxx
  7: INEG TEMP[3].x, CONST[1].yyyy
  8: UADD TEMP[3].x, TEMP[1].yyyy, TEMP[3].xxxx
  9: IABS TEMP[3].x, TEMP[3].xxxx
 10: UADD TEMP[2].x, TEMP[2].xxxx, TEMP[3].xxxx
 11: USEQ TEMP[1], TEMP[1].xyyy, CONST[0].xyyy
 12: AND TEMP[1].x, TEMP[1].xxxx, TEMP[1].yyyy
 13: AND TEMP[3], TEMP[1].xxxx, IMM[0].xxxx
 14: KILL_IF -TEMP[3].xxxx
 15: MOV TEMP[3], IMM[0].yyyy
 16: MOV TEMP[4].x, IMM[1].xxxx
 17: BGNLOOP :0
 18:   ISGE TEMP[5].x, TEMP[4].xxxx, IMM[1].yyyy
 19:   UIF TEMP[5].xxxx :0
 20:     BRK
 21:   ENDIF
 22:   ADD TEMP[6].x, TEMP[3].zzzz, IMM[0].zzzz
 23:   MOV TEMP[3].z, TEMP[6].xxxx
 24:   UADD TEMP[4].x, TEMP[4].xxxx, TEMP[2].xxxx
 25:   UIF TEMP[1].xxxx :0
 26:     BRK
 27:   ENDIF
 28: ENDLOOP :0
 29: MOV OUT[0], TEMP[3]
 30: END
radeonsi: Compiling shader 4
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %23 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !invariant.load !0
  %25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
  %26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
  %27 = fmul float %25, %16
  %28 = fadd float %27, %26
  %29 = fptosi float %15 to i32
  %30 = fptosi float %28 to i32
  %31 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %32 = load <16 x i8>, <16 x i8> addrspace(2)* %31, align 16, !invariant.load !0
  %33 = call float @llvm.SI.load.const(<16 x i8> %32, i32 16)
  %34 = bitcast float %33 to i32
  %35 = sub i32 %29, %34
  %36 = sub i32 0, %35
  %37 = icmp sgt i32 %35, %36
  %38 = select i1 %37, i32 %35, i32 %36
  %39 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, align 16, !invariant.load !0
  %41 = call float @llvm.SI.load.const(<16 x i8> %40, i32 20)
  %42 = bitcast float %41 to i32
  %43 = sub i32 %30, %42
  %44 = sub i32 0, %43
  %45 = icmp sgt i32 %43, %44
  %46 = select i1 %45, i32 %43, i32 %44
  %47 = add i32 %38, %46
  %48 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %49 = load <16 x i8>, <16 x i8> addrspace(2)* %48, align 16, !invariant.load !0
  %50 = call float @llvm.SI.load.const(<16 x i8> %49, i32 0)
  %51 = bitcast float %50 to i32
  %52 = icmp eq i32 %29, %51
  %53 = call float @llvm.SI.load.const(<16 x i8> %49, i32 4)
  %54 = bitcast float %53 to i32
  %55 = icmp eq i32 %30, %54
  %56 = and i1 %52, %55
  %57 = select i1 %56, float -1.000000e+00, float 0.000000e+00
  call void @llvm.AMDGPU.kill(float %57)
  br label %loop17

loop17:                                           ; preds = %endif21, %main_body
  %58 = phi i32 [ 0, %main_body ], [ %62, %endif21 ]
  %59 = phi i32 [ 0, %main_body ], [ %62, %endif21 ]
  %TEMP3.z.0 = phi float [ 0.000000e+00, %main_body ], [ %61, %endif21 ]
  %60 = icmp sgt i32 %58, 9
  br i1 %60, label %endloop28, label %endif21

endif21:                                          ; preds = %loop17
  %61 = fadd float %TEMP3.z.0, 0x3FB99999A0000000
  %62 = add i32 %59, %47
  br i1 %56, label %endloop28, label %loop17

endloop28:                                        ; preds = %endif21, %loop17
  %TEMP3.z.1 = phi float [ %TEMP3.z.0, %loop17 ], [ %61, %endif21 ]
  %63 = bitcast float %5 to i32
  %64 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %63, 10
  %65 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %64, float 0.000000e+00, 11
  %66 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %65, float 0.000000e+00, 12
  %67 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %66, float %TEMP3.z.1, 13
  %68 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %67, float 0.000000e+00, 14
  %69 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %68, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %69
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1

; Function Attrs: nounwind
declare void @llvm.AMDGPU.kill(float) #2

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

!0 = !{}

FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 5
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
  %24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
  %25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
  %26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
  %27 = bitcast float %5 to i32
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

radeonsi: Compiling shader 9
Fragment Shader Epilog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
  %20 = call i32 @llvm.SI.packf16(float %6, float %7)
  %21 = bitcast i32 %20 to float
  %22 = call i32 @llvm.SI.packf16(float %8, float %9)
  %23 = bitcast i32 %22 to float
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %21, float %23, float undef, float undef)
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2

attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

SHADER KEY
  prolog.color_two_side = 0
  prolog.flatshade_colors = 0
  prolog.poly_stipple = 0
  prolog.force_persp_sample_interp = 0
  prolog.force_linear_sample_interp = 0
  prolog.force_persp_center_interp = 0
  prolog.force_linear_center_interp = 0
  prolog.bc_optimize_for_persp = 0
  prolog.bc_optimize_for_linear = 0
  epilog.spi_shader_col_format = 0x4
  epilog.color_is_int8 = 0x0
  epilog.last_cbuf = 0
  epilog.alpha_func = 7
  epilog.alpha_to_one = 0
  epilog.poly_line_smoothing = 0
  epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
	s_mov_b32 m0, s11                   ; BEFC030B
	v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
	v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
	v_interp_mov_f32 v2, P0, 2, 0, [m0] ; C80A0202
	v_interp_mov_f32 v3, P0, 3, 0, [m0] ; C80E0302
Shader epilog disassembly:
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
	exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA  = 0x0020
*** SHADER STATS ***
SGPRS: 16
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************

SHADER KEY
  prolog.color_two_side = 0
  prolog.flatshade_colors = 0
  prolog.poly_stipple = 0
  prolog.force_persp_sample_interp = 0
  prolog.force_linear_sample_interp = 0
  prolog.force_persp_center_interp = 0
  prolog.force_linear_center_interp = 0
  prolog.bc_optimize_for_persp = 0
  prolog.bc_optimize_for_linear = 0
  epilog.spi_shader_col_format = 0x4
  epilog.color_is_int8 = 0x0
  epilog.last_cbuf = 0
  epilog.alpha_func = 7
  epilog.alpha_to_one = 0
  epilog.poly_line_smoothing = 0
  epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
	s_load_dwordx4 s[0:3], s[2:3], 0x0    ; C0800300
	v_cvt_i32_f32_e32 v0, v12             ; 7E00110C
	s_waitcnt lgkmcnt(0)                  ; BF8C007F
	s_buffer_load_dword s4, s[0:3], 0x9   ; C2020109
	s_buffer_load_dword s5, s[0:3], 0x0   ; C2028100
	s_buffer_load_dword s6, s[0:3], 0x1   ; C2030101
	s_buffer_load_dword s7, s[0:3], 0x4   ; C2038104
	s_buffer_load_dword s8, s[0:3], 0x5   ; C2040105
	s_buffer_load_dword s2, s[0:3], 0x8   ; C2010108
	s_waitcnt lgkmcnt(0)                  ; BF8C007F
	v_mov_b32_e32 v1, s4                  ; 7E020204
	v_cmp_eq_u32_e64 s[0:1], s5, v0       ; D1840000 00020005
	v_subrev_i32_e32 v0, vcc, s7, v0      ; 4E000007
	v_sub_i32_e32 v2, vcc, 0, v0          ; 4C040080
	v_mac_f32_e32 v1, s2, v13             ; 3E021A02
	v_cvt_i32_f32_e32 v1, v1              ; 7E021101
	v_max_i32_e32 v0, v2, v0              ; 24000102
	v_subrev_i32_e32 v2, vcc, s8, v1      ; 4E040208
	v_sub_i32_e32 v3, vcc, 0, v2          ; 4C060480
	v_max_i32_e32 v2, v3, v2              ; 24040503
	v_add_i32_e32 v0, vcc, v2, v0         ; 4A000102
	v_cmp_eq_u32_e32 vcc, s6, v1          ; 7D840206
	s_and_b64 s[0:1], s[0:1], vcc         ; 87806A00
	v_mov_b32_e32 v2, 0                   ; 7E040280
	v_cndmask_b32_e64 v1, 0, -1.0, s[0:1] ; D2000001 0001E680
	v_cmpx_le_f32_e32 vcc, 0, v1          ; 7C260280
	s_cbranch_execnz BB0_2                ; BF890000
	exp 0, 9, 0, 1, 1, v0, v0, v0, v0     ; F8001890 00000000
	s_endpgm                              ; BF810000
	v_sub_i32_e32 v1, vcc, 0, v0          ; 4C020080
	s_mov_b64 s[2:3], 0                   ; BE820480
	v_add_i32_e32 v1, vcc, v1, v0         ; 4A020101
	v_cmp_gt_i32_e32 vcc, 10, v1          ; 7D08028A
	s_and_saveexec_b64 s[4:5], vcc        ; BE84246A
	s_xor_b64 s[4:5], exec, s[4:5]        ; 8984047E
	s_or_b64 s[2:3], s[0:1], s[2:3]       ; 88820200
	v_add_f32_e32 v2, 0x3dcccccd, v2      ; 060404FF 3DCCCCCD
	s_or_b64 exec, exec, s[4:5]           ; 88FE047E
	s_or_b64 s[2:3], s[4:5], s[2:3]       ; 88820204
	s_andn2_b64 exec, exec, s[2:3]        ; 8AFE027E
	s_cbranch_execnz BB0_3                ; BF890000
	s_or_b64 exec, exec, s[2:3]           ; 88FE027E
	v_mov_b32_e32 v0, 0                   ; 7E000280
	v_mov_b32_e32 v1, 0                   ; 7E020280
	v_mov_b32_e32 v3, 0                   ; 7E060280
	v_mov_b32_e32 v13, v15                ; 7E1A030F
Shader epilog disassembly:
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
	exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd377
SPI_PS_INPUT_ENA  = 0x0320
*** SHADER STATS ***
SGPRS: 16
VGPRS: 17
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 220 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************

FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
  0: TEX OUT[0], IN[0], SAMP[0], 2D
  1: END
radeonsi: Compiling shader 11
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %23 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
  %24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !invariant.load !0
  %25 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
  %26 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %25, i64 0, i64 3, !amdgpu.uniform !0
  %27 = load <4 x i32>, <4 x i32> addrspace(2)* %26, align 16, !invariant.load !0
  %28 = extractelement <8 x i32> %24, i32 7
  %29 = extractelement <4 x i32> %27, i32 0
  %30 = and i32 %29, %28
  %31 = insertelement <4 x i32> %27, i32 %30, i32 0
  %32 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
  %33 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
  %34 = bitcast float %32 to i32
  %35 = bitcast float %33 to i32
  %36 = insertelement <2 x i32> undef, i32 %34, i32 0
  %37 = insertelement <2 x i32> %36, i32 %35, i32 1
  %38 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %37, <8 x i32> %24, <4 x i32> %31, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %39 = extractelement <4 x float> %38, i32 0
  %40 = extractelement <4 x float> %38, i32 1
  %41 = extractelement <4 x float> %38, i32 2
  %42 = extractelement <4 x float> %38, i32 3
  %43 = bitcast float %5 to i32
  %44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %43, 10
  %45 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %44, float %39, 11
  %46 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %45, float %40, 12
  %47 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %46, float %41, 13
  %48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %47, float %42, 14
  %49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %48, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %49
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

!0 = !{}

radeonsi: Compiling shader 12
Fragment Shader Epilog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %6, float %7, float %8, float %9)
  ret void
}

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1

attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind }

SHADER KEY
  prolog.color_two_side = 0
  prolog.flatshade_colors = 0
  prolog.poly_stipple = 0
  prolog.force_persp_sample_interp = 0
  prolog.force_linear_sample_interp = 0
  prolog.force_persp_center_interp = 0
  prolog.force_linear_center_interp = 0
  prolog.bc_optimize_for_persp = 0
  prolog.bc_optimize_for_linear = 0
  epilog.spi_shader_col_format = 0x9
  epilog.color_is_int8 = 0x0
  epilog.last_cbuf = 0
  epilog.alpha_func = 7
  epilog.alpha_to_one = 0
  epilog.poly_line_smoothing = 0
  epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
	s_mov_b64 s[6:7], exec                                  ; BE86047E
	s_wqm_b64 exec, exec                                    ; BEFE0A7E
	s_load_dwordx8 s[12:19], s[4:5], 0x0                    ; C0C60500
	s_load_dwordx4 s[0:3], s[4:5], 0xc                      ; C080050C
	s_mov_b32 m0, s11                                       ; BEFC030B
	v_interp_p1_f32 v0, v8, 0, 0, [m0]                      ; C8000008
	v_interp_p2_f32 v0, [v0], v9, 0, 0, [m0]                ; C8010009
	v_interp_p1_f32 v1, v8, 1, 0, [m0]                      ; C8040108
	s_waitcnt lgkmcnt(0)                                    ; BF8C007F
	s_and_b32 s0, s0, s19                                   ; 87001300
	v_interp_p2_f32 v1, [v1], v9, 1, 0, [m0]                ; C8050109
	s_and_b64 exec, exec, s[6:7]                            ; 87FE067E
	image_sample v[0:3], v[0:1], s[12:19], s[0:3] dmask:0xf ; F0800F00 00030000
	s_waitcnt vmcnt(0)                                      ; BF8C0F70
Shader epilog disassembly:
	exp 15, 0, 0, 1, 1, v0, v1, v2, v3 ; F800180F 03020100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA  = 0x0020
*** SHADER STATS ***
SGPRS: 24
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 72 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************

-------------- next part --------------
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], COLOR, COLOR
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 1
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32, float, float, float, float) #0 {
main_body:
  %27 = bitcast float %5 to i32
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}

attributes #0 = { "InitialPSInputAddr"="36983" }

FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL SV[0], POSITION
DCL OUT[0], COLOR
DCL CONST[1..2]
DCL CONST[0]
DCL TEMP[0]
DCL TEMP[1..6], LOCAL
IMM[0] FLT32 {    1.0000,     0.0000,     0.1000,     0.0000}
IMM[1] INT32 {0, 10, 0, 0}
  0: MOV TEMP[0], SV[0]
  1: MAD TEMP[0].y, SV[0], CONST[2].xxxx, CONST[2].yyyy
  2: F2I TEMP[1], TEMP[0]
  3: MOV TEMP[2].xy, TEMP[1].xyxx
  4: INEG TEMP[2].x, CONST[1].xxxx
  5: UADD TEMP[2].x, TEMP[1].xxxx, TEMP[2].xxxx
  6: IABS TEMP[2].x, TEMP[2].xxxx
  7: INEG TEMP[3].x, CONST[1].yyyy
  8: UADD TEMP[3].x, TEMP[1].yyyy, TEMP[3].xxxx
  9: IABS TEMP[3].x, TEMP[3].xxxx
 10: UADD TEMP[2].x, TEMP[2].xxxx, TEMP[3].xxxx
 11: USEQ TEMP[1], TEMP[1].xyyy, CONST[0].xyyy
 12: AND TEMP[1].x, TEMP[1].xxxx, TEMP[1].yyyy
 13: AND TEMP[3], TEMP[1].xxxx, IMM[0].xxxx
 14: KILL_IF -TEMP[3].xxxx
 15: MOV TEMP[3], IMM[0].yyyy
 16: MOV TEMP[4].x, IMM[1].xxxx
 17: BGNLOOP :0
 18:   ISGE TEMP[5].x, TEMP[4].xxxx, IMM[1].yyyy
 19:   UIF TEMP[5].xxxx :0
 20:     BRK
 21:   ENDIF
 22:   ADD TEMP[6].x, TEMP[3].zzzz, IMM[0].zzzz
 23:   MOV TEMP[3].z, TEMP[6].xxxx
 24:   UADD TEMP[4].x, TEMP[4].xxxx, TEMP[2].xxxx
 25:   UIF TEMP[1].xxxx :0
 26:     BRK
 27:   ENDIF
 28: ENDLOOP :0
 29: MOV OUT[0], TEMP[3]
 30: END
radeonsi: Compiling shader 4
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %23 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !invariant.load !0
  %25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
  %26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
  %27 = fmul float %25, %16
  %28 = fadd float %27, %26
  %29 = fptosi float %15 to i32
  %30 = fptosi float %28 to i32
  %31 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %32 = load <16 x i8>, <16 x i8> addrspace(2)* %31, align 16, !invariant.load !0
  %33 = call float @llvm.SI.load.const(<16 x i8> %32, i32 16)
  %34 = bitcast float %33 to i32
  %35 = sub i32 %29, %34
  %36 = sub i32 0, %35
  %37 = icmp sgt i32 %35, %36
  %38 = select i1 %37, i32 %35, i32 %36
  %39 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, align 16, !invariant.load !0
  %41 = call float @llvm.SI.load.const(<16 x i8> %40, i32 20)
  %42 = bitcast float %41 to i32
  %43 = sub i32 %30, %42
  %44 = sub i32 0, %43
  %45 = icmp sgt i32 %43, %44
  %46 = select i1 %45, i32 %43, i32 %44
  %47 = add i32 %38, %46
  %48 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %49 = load <16 x i8>, <16 x i8> addrspace(2)* %48, align 16, !invariant.load !0
  %50 = call float @llvm.SI.load.const(<16 x i8> %49, i32 0)
  %51 = bitcast float %50 to i32
  %52 = icmp eq i32 %29, %51
  %53 = call float @llvm.SI.load.const(<16 x i8> %49, i32 4)
  %54 = bitcast float %53 to i32
  %55 = icmp eq i32 %30, %54
  %56 = and i1 %52, %55
  %57 = select i1 %56, float -1.000000e+00, float 0.000000e+00
  call void @llvm.AMDGPU.kill(float %57)
  br label %loop17

loop17:                                           ; preds = %endif21, %main_body
  %58 = phi i32 [ 0, %main_body ], [ %62, %endif21 ]
  %59 = phi i32 [ 0, %main_body ], [ %62, %endif21 ]
  %TEMP3.z.0 = phi float [ 0.000000e+00, %main_body ], [ %61, %endif21 ]
  %60 = icmp sgt i32 %58, 9
  br i1 %60, label %endloop28, label %endif21

endif21:                                          ; preds = %loop17
  %61 = fadd float %TEMP3.z.0, 0x3FB99999A0000000
  %62 = add i32 %59, %47
  br i1 %56, label %endloop28, label %loop17

endloop28:                                        ; preds = %endif21, %loop17
  %TEMP3.z.1 = phi float [ %TEMP3.z.0, %loop17 ], [ %61, %endif21 ]
  %63 = bitcast float %5 to i32
  %64 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %63, 10
  %65 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %64, float 0.000000e+00, 11
  %66 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %65, float 0.000000e+00, 12
  %67 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %66, float %TEMP3.z.1, 13
  %68 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %67, float 0.000000e+00, 14
  %69 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %68, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %69
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1

; Function Attrs: nounwind
declare void @llvm.AMDGPU.kill(float) #2

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

!0 = !{}

FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 5
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

; Main part of the fragment shader dumped as "shader 5".
; All 23 parameters are unnamed, so they are numbered %0..%22 and the first
; instruction result is %23.
; Returns a packed struct of 11 i32 fields (0..10) followed by 14 float fields
; (11..24). Only fields 10, 11..14 and 24 are written; the rest stay undef.
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  ; Four llvm.SI.fs.constant calls that differ only in the first operand (0..3);
  ; the second operand is always 0 and the third is the inreg i32 parameter %6.
  ; NOTE(review): operands are presumably (channel, attribute index, primitive
  ; mask) -- confirm against the intrinsic definition.
  %23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
  %24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
  %25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
  %26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
  ; Forward the inreg float parameter %5 as raw bits in field 10 (last i32 field).
  %27 = bitcast float %5 to i32
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
  ; Fields 11..14 receive the four fetched values in call order.
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
  ; Field 24 (the last float field) forwards the float parameter %21 unchanged.
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

radeonsi: Compiling shader 9
Fragment Shader Epilog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

; Fragment shader epilog dumped as "shader 9".
; The 20 parameters are unnamed (%0..%19): five inreg i64, one inreg float (%5)
; and fourteen floats (%6..%19). Only %6..%9 are used.
; Packs %6/%7 and %8/%9 with llvm.SI.packf16 and hands the two packed words to a
; single llvm.SI.export call; returns nothing.
define amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
  ; Each packf16 result is an i32; it is bitcast to float only because the
  ; export intrinsic takes float data operands.
  %20 = call i32 @llvm.SI.packf16(float %6, float %7)
  %21 = bitcast i32 %20 to float
  %22 = call i32 @llvm.SI.packf16(float %8, float %9)
  %23 = bitcast i32 %22 to float
  ; Only the first two data operands carry values; the last two are undef.
  ; NOTE(review): the five leading i32 operands (15, 1, 1, 0, 1) are presumably
  ; enable mask / valid-mask / done / target / compressed -- confirm against the
  ; llvm.SI.export definition.
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %21, float %23, float undef, float undef)
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2

attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

SHADER KEY
  prolog.color_two_side = 0
  prolog.flatshade_colors = 0
  prolog.poly_stipple = 0
  prolog.force_persp_sample_interp = 0
  prolog.force_linear_sample_interp = 0
  prolog.force_persp_center_interp = 0
  prolog.force_linear_center_interp = 0
  prolog.bc_optimize_for_persp = 0
  prolog.bc_optimize_for_linear = 0
  epilog.spi_shader_col_format = 0x4
  epilog.color_is_int8 = 0x0
  epilog.last_cbuf = 0
  epilog.alpha_func = 7
  epilog.alpha_to_one = 0
  epilog.poly_line_smoothing = 0
  epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
	s_mov_b32 m0, s11                   ; BEFC030B
	v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
	v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
	v_interp_mov_f32 v2, P0, 2, 0, [m0] ; C80A0202
	v_interp_mov_f32 v3, P0, 3, 0, [m0] ; C80E0302
Shader epilog disassembly:
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
	exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA  = 0x0020
*** SHADER STATS ***
SGPRS: 16
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************

SHADER KEY
  prolog.color_two_side = 0
  prolog.flatshade_colors = 0
  prolog.poly_stipple = 0
  prolog.force_persp_sample_interp = 0
  prolog.force_linear_sample_interp = 0
  prolog.force_persp_center_interp = 0
  prolog.force_linear_center_interp = 0
  prolog.bc_optimize_for_persp = 0
  prolog.bc_optimize_for_linear = 0
  epilog.spi_shader_col_format = 0x4
  epilog.color_is_int8 = 0x0
  epilog.last_cbuf = 0
  epilog.alpha_func = 7
  epilog.alpha_to_one = 0
  epilog.poly_line_smoothing = 0
  epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
	s_load_dwordx4 s[0:3], s[2:3], 0x0    ; C0800300
	v_cvt_i32_f32_e32 v0, v12             ; 7E00110C
	s_waitcnt lgkmcnt(0)                  ; BF8C007F
	s_buffer_load_dword s4, s[0:3], 0x9   ; C2020109
	s_buffer_load_dword s5, s[0:3], 0x0   ; C2028100
	s_buffer_load_dword s6, s[0:3], 0x1   ; C2030101
	s_buffer_load_dword s7, s[0:3], 0x4   ; C2038104
	s_buffer_load_dword s8, s[0:3], 0x5   ; C2040105
	s_buffer_load_dword s2, s[0:3], 0x8   ; C2010108
	s_waitcnt lgkmcnt(0)                  ; BF8C007F
	v_mov_b32_e32 v1, s4                  ; 7E020204
	v_cmp_eq_u32_e64 s[0:1], s5, v0       ; D1840000 00020005
	v_subrev_i32_e32 v0, vcc, s7, v0      ; 4E000007
	v_sub_i32_e32 v2, vcc, 0, v0          ; 4C040080
	v_mac_f32_e32 v1, s2, v13             ; 3E021A02
	v_cvt_i32_f32_e32 v1, v1              ; 7E021101
	v_max_i32_e32 v0, v2, v0              ; 24000102
	v_subrev_i32_e32 v2, vcc, s8, v1      ; 4E040208
	v_sub_i32_e32 v3, vcc, 0, v2          ; 4C060480
	v_max_i32_e32 v2, v3, v2              ; 24040503
	v_add_i32_e32 v0, vcc, v2, v0         ; 4A000102
	v_cmp_eq_u32_e32 vcc, s6, v1          ; 7D840206
	s_and_b64 s[0:1], s[0:1], vcc         ; 87806A00
	v_cndmask_b32_e64 v1, 0, -1, s[0:1]   ; D2000001 00018280
	v_mov_b32_e32 v2, 0                   ; 7E040280
	v_cndmask_b32_e64 v3, 0, -1.0, s[0:1] ; D2000003 0001E680
	v_cmpx_le_f32_e32 vcc, 0, v3          ; 7C260680
	s_cbranch_execnz BB0_2                ; BF890000
	exp 0, 9, 0, 1, 1, v0, v0, v0, v0     ; F8001890 00000000
	s_endpgm                              ; BF810000
	v_sub_i32_e32 v3, vcc, 0, v0          ; 4C060080
	s_mov_b64 s[2:3], 0                   ; BE820480
	v_add_i32_e32 v3, vcc, v3, v0         ; 4A060103
	v_cmp_ne_u32_e64 s[0:1], 0, v1        ; D18A0000 00020280
	v_cmp_gt_i32_e32 vcc, 10, v3          ; 7D08068A
	s_and_saveexec_b64 s[4:5], vcc        ; BE84246A
	s_xor_b64 s[4:5], exec, s[4:5]        ; 8984047E
	s_or_b64 s[2:3], s[0:1], s[2:3]       ; 88820200
	v_add_f32_e32 v2, 0x3dcccccd, v2      ; 060404FF 3DCCCCCD
	s_or_b64 exec, exec, s[4:5]           ; 88FE047E
	s_or_b64 s[2:3], s[4:5], s[2:3]       ; 88820204
	s_andn2_b64 exec, exec, s[2:3]        ; 8AFE027E
	s_cbranch_execnz BB0_3                ; BF890000
	s_or_b64 exec, exec, s[2:3]           ; 88FE027E
	v_mov_b32_e32 v0, 0                   ; 7E000280
	v_mov_b32_e32 v1, 0                   ; 7E020280
	v_mov_b32_e32 v3, 0                   ; 7E060280
	v_mov_b32_e32 v13, v15                ; 7E1A030F
Shader epilog disassembly:
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
	exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd377
SPI_PS_INPUT_ENA  = 0x0320
*** SHADER STATS ***
SGPRS: 16
VGPRS: 17
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 236 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************

FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
  0: TEX OUT[0], IN[0], SAMP[0], 2D
  1: END
radeonsi: Compiling shader 11
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

; Main part of the fragment shader dumped as "shader 11".
; All 23 parameters are unnamed, so they are numbered %0..%22 and the first
; instruction result is %23.
; Loads two vectors through pointer parameter %2, computes two values with
; llvm.SI.fs.interp, issues one llvm.SI.image.sample.v2i32 call and returns the
; four sampled components in fields 11..14 of the packed return struct, along
; with %5 (field 10, as raw bits) and %21 (field 24). Other fields stay undef.
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  ; Load element 0 of the [32 x <8 x i32>] array that %2 points to.
  %23 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
  %24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !invariant.load !0
  ; Reinterpret the same pointer as an array of <4 x i32> and load element 3,
  ; i.e. the 16 bytes at byte offset 48 from %2.
  %25 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
  %26 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %25, i64 0, i64 3, !amdgpu.uniform !0
  %27 = load <4 x i32>, <4 x i32> addrspace(2)* %26, align 16, !invariant.load !0
  ; Replace element 0 of the <4 x i32> with itself ANDed with element 7 of the
  ; <8 x i32>.
  ; NOTE(review): %24 / %27 are presumably the image and sampler descriptors --
  ; confirm against the driver's descriptor layout.
  %28 = extractelement <8 x i32> %24, i32 7
  %29 = extractelement <4 x i32> %27, i32 0
  %30 = and i32 %29, %28
  %31 = insertelement <4 x i32> %27, i32 %30, i32 0
  ; Two llvm.SI.fs.interp calls that differ only in the first operand (0 and 1);
  ; both take the inreg i32 parameter %6 and the <2 x i32> parameter %12.
  %32 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
  %33 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
  ; Repack the two floats bit-for-bit into the <2 x i32> first operand of the
  ; sample call.
  %34 = bitcast float %32 to i32
  %35 = bitcast float %33 to i32
  %36 = insertelement <2 x i32> undef, i32 %34, i32 0
  %37 = insertelement <2 x i32> %36, i32 %35, i32 1
  ; NOTE(review): the i32 15 operand is presumably a 4-channel write mask and
  ; the trailing zeros modifier flags -- confirm against the intrinsic definition.
  %38 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %37, <8 x i32> %24, <4 x i32> %31, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ; Split the sampled vector into its four components.
  %39 = extractelement <4 x float> %38, i32 0
  %40 = extractelement <4 x float> %38, i32 1
  %41 = extractelement <4 x float> %38, i32 2
  %42 = extractelement <4 x float> %38, i32 3
  ; Forward the inreg float parameter %5 as raw bits in field 10 (last i32 field).
  %43 = bitcast float %5 to i32
  %44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %43, 10
  ; Fields 11..14 receive sampled components 0..3 in order.
  %45 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %44, float %39, 11
  %46 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %45, float %40, 12
  %47 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %46, float %41, 13
  %48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %47, float %42, 14
  ; Field 24 (the last float field) forwards the float parameter %21 unchanged.
  %49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %48, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %49
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

!0 = !{}

radeonsi: Compiling shader 12
Fragment Shader Epilog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

; Fragment shader epilog dumped as "shader 12".
; The 20 parameters are unnamed (%0..%19): five inreg i64, one inreg float (%5)
; and fourteen floats (%6..%19). Only %6..%9 are used.
; Passes %6..%9 unmodified as the four data operands of a single
; llvm.SI.export call; returns nothing.
define amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
  ; NOTE(review): the five leading i32 operands (15, 1, 1, 0, 0) are presumably
  ; enable mask / valid-mask / done / target / compressed -- confirm against the
  ; llvm.SI.export definition.
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %6, float %7, float %8, float %9)
  ret void
}

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1

attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind }

SHADER KEY
  prolog.color_two_side = 0
  prolog.flatshade_colors = 0
  prolog.poly_stipple = 0
  prolog.force_persp_sample_interp = 0
  prolog.force_linear_sample_interp = 0
  prolog.force_persp_center_interp = 0
  prolog.force_linear_center_interp = 0
  prolog.bc_optimize_for_persp = 0
  prolog.bc_optimize_for_linear = 0
  epilog.spi_shader_col_format = 0x9
  epilog.color_is_int8 = 0x0
  epilog.last_cbuf = 0
  epilog.alpha_func = 7
  epilog.alpha_to_one = 0
  epilog.poly_line_smoothing = 0
  epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
	s_mov_b64 s[6:7], exec                                  ; BE86047E
	s_wqm_b64 exec, exec                                    ; BEFE0A7E
	s_load_dwordx8 s[12:19], s[4:5], 0x0                    ; C0C60500
	s_load_dwordx4 s[0:3], s[4:5], 0xc                      ; C080050C
	s_mov_b32 m0, s11                                       ; BEFC030B
	v_interp_p1_f32 v0, v8, 0, 0, [m0]                      ; C8000008
	v_interp_p2_f32 v0, [v0], v9, 0, 0, [m0]                ; C8010009
	v_interp_p1_f32 v1, v8, 1, 0, [m0]                      ; C8040108
	s_waitcnt lgkmcnt(0)                                    ; BF8C007F
	s_and_b32 s0, s0, s19                                   ; 87001300
	v_interp_p2_f32 v1, [v1], v9, 1, 0, [m0]                ; C8050109
	s_and_b64 exec, exec, s[6:7]                            ; 87FE067E
	image_sample v[0:3], v[0:1], s[12:19], s[0:3] dmask:0xf ; F0800F00 00030000
	s_waitcnt vmcnt(0)                                      ; BF8C0F70
Shader epilog disassembly:
	exp 15, 0, 0, 1, 1, v0, v1, v2, v3 ; F800180F 03020100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA  = 0x0020
*** SHADER STATS ***
SGPRS: 24
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 72 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************