[llvm] r247074 - AMDGPU/SI: Fix input vcc operand for VOP2b instructions
Michel Dänzer via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 9 00:20:57 PDT 2015
Hi Matt,
On 09.09.2015 06:15, Matt Arsenault via llvm-commits wrote:
> Author: arsenm
> Date: Tue Sep 8 16:15:00 2015
> New Revision: 247074
>
> URL: http://llvm.org/viewvc/llvm-project?rev=247074&view=rev
> Log:
> AMDGPU/SI: Fix input vcc operand for VOP2b instructions
>
> Adds vcc to output string input for e32. Allows option
> of using e64 encoding with assembler.
>
> Also fixes these instructions not implicitly reading exec.
This change broke a bunch of fp64-related piglit (graphics) tests on my
Kaveri. I'm attaching an example of good and bad LLVM IR.
--
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
-------------- next part --------------
SHADER KEY
export_16bpc = 0x3
last_cbuf = 0
color_two_side = 0
alpha_func = 7
alpha_to_one = 0
poly_stipple = 0
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL OUT[0], COLOR
DCL CONST[0..4]
DCL TEMP[0..5], LOCAL
IMM[0] FLT32 { 0.0000, 1.0000, 0.0000, 0.0000}
0: DNEG TEMP[0].xy, CONST[0].zwzw
1: DNEG TEMP[1].xy, CONST[1].xyxy
2: DMUL TEMP[2].xy, CONST[1].xyxy, CONST[0].zwzw
3: DNEG TEMP[2].xy, TEMP[2].xyxy
4: DMAD TEMP[2].xy, CONST[0].xyxy, CONST[1].zwzw, TEMP[2].xyxy
5: DRCP TEMP[3].xy, TEMP[2].xyxy
6: DNEG TEMP[4].xy, CONST[3].xyxy
7: DNEG TEMP[4].zw, CONST[3].zwzw
8: DMAD TEMP[5].xy, CONST[1].zwzw, TEMP[3].xyxy, TEMP[4].xyxy
9: DMAD TEMP[5].zw, TEMP[0].xyxy, TEMP[3].xyxy, TEMP[4].zwzw
10: DRCP TEMP[0].xy, TEMP[2].xyxy
11: DNEG TEMP[2].xy, CONST[4].xyxy
12: DNEG TEMP[2].zw, CONST[4].zwzw
13: DMAD TEMP[1].xy, TEMP[1].xyxy, TEMP[0].xyxy, TEMP[2].xyxy
14: DMAD TEMP[1].zw, CONST[0].xyxy, TEMP[0].xyxy, TEMP[2].zwzw
15: DMUL TEMP[0].xy, TEMP[5].xyxy, TEMP[5].xyxy
16: DMAD TEMP[0].xy, TEMP[5].zwzw, TEMP[5].zwzw, TEMP[0].xyxy
17: DMAD TEMP[0].xy, TEMP[1].xyxy, TEMP[1].xyxy, TEMP[0].xyxy
18: DMAD TEMP[0].xy, TEMP[1].zwzw, TEMP[1].zwzw, TEMP[0].xyxy
19: DMUL TEMP[1].xy, CONST[2].xyxy, CONST[2].xyxy
20: DSGE TEMP[0].x, TEMP[1].xyxy, TEMP[0].xyxy
21: UIF TEMP[0].xxxx :0
22: MOV TEMP[0], IMM[0].xyxy
23: ELSE :0
24: MOV TEMP[0], IMM[0].yxxy
25: ENDIF
26: MOV OUT[0], TEMP[0]
27: END
; ModuleID = 'tgsi'
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
%22 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0
%23 = load <16 x i8>, <16 x i8> addrspace(2)* %22, align 16, !tbaa !0
%24 = call float @llvm.SI.load.const(<16 x i8> %23, i32 0)
%25 = call float @llvm.SI.load.const(<16 x i8> %23, i32 4)
%26 = call float @llvm.SI.load.const(<16 x i8> %23, i32 8)
%27 = call float @llvm.SI.load.const(<16 x i8> %23, i32 12)
%28 = call float @llvm.SI.load.const(<16 x i8> %23, i32 16)
%29 = call float @llvm.SI.load.const(<16 x i8> %23, i32 20)
%30 = call float @llvm.SI.load.const(<16 x i8> %23, i32 24)
%31 = call float @llvm.SI.load.const(<16 x i8> %23, i32 28)
%32 = call float @llvm.SI.load.const(<16 x i8> %23, i32 32)
%33 = call float @llvm.SI.load.const(<16 x i8> %23, i32 36)
%34 = call float @llvm.SI.load.const(<16 x i8> %23, i32 48)
%35 = call float @llvm.SI.load.const(<16 x i8> %23, i32 52)
%36 = call float @llvm.SI.load.const(<16 x i8> %23, i32 56)
%37 = call float @llvm.SI.load.const(<16 x i8> %23, i32 60)
%38 = call float @llvm.SI.load.const(<16 x i8> %23, i32 64)
%39 = call float @llvm.SI.load.const(<16 x i8> %23, i32 68)
%40 = call float @llvm.SI.load.const(<16 x i8> %23, i32 72)
%41 = call float @llvm.SI.load.const(<16 x i8> %23, i32 76)
%42 = bitcast float %26 to i32
%43 = insertelement <2 x i32> undef, i32 %42, i32 0
%44 = bitcast float %27 to i32
%45 = insertelement <2 x i32> %43, i32 %44, i32 1
%46 = bitcast <2 x i32> %45 to double
%47 = bitcast float %28 to i32
%48 = insertelement <2 x i32> undef, i32 %47, i32 0
%49 = bitcast float %29 to i32
%50 = insertelement <2 x i32> %48, i32 %49, i32 1
%51 = bitcast <2 x i32> %50 to double
%52 = bitcast float %28 to i32
%53 = insertelement <2 x i32> undef, i32 %52, i32 0
%54 = bitcast float %29 to i32
%55 = insertelement <2 x i32> %53, i32 %54, i32 1
%56 = bitcast <2 x i32> %55 to double
%57 = bitcast float %26 to i32
%58 = insertelement <2 x i32> undef, i32 %57, i32 0
%59 = bitcast float %27 to i32
%60 = insertelement <2 x i32> %58, i32 %59, i32 1
%61 = bitcast <2 x i32> %60 to double
%62 = fmul double %56, %61
%63 = bitcast float %24 to i32
%64 = insertelement <2 x i32> undef, i32 %63, i32 0
%65 = bitcast float %25 to i32
%66 = insertelement <2 x i32> %64, i32 %65, i32 1
%67 = bitcast <2 x i32> %66 to double
%68 = bitcast float %30 to i32
%69 = insertelement <2 x i32> undef, i32 %68, i32 0
%70 = bitcast float %31 to i32
%71 = insertelement <2 x i32> %69, i32 %70, i32 1
%72 = bitcast <2 x i32> %71 to double
%73 = fmul double %67, %72
%74 = fsub double %73, %62
%75 = fdiv double 1.000000e+00, %74
%76 = bitcast float %34 to i32
%77 = insertelement <2 x i32> undef, i32 %76, i32 0
%78 = bitcast float %35 to i32
%79 = insertelement <2 x i32> %77, i32 %78, i32 1
%80 = bitcast <2 x i32> %79 to double
%81 = bitcast float %36 to i32
%82 = insertelement <2 x i32> undef, i32 %81, i32 0
%83 = bitcast float %37 to i32
%84 = insertelement <2 x i32> %82, i32 %83, i32 1
%85 = bitcast <2 x i32> %84 to double
%86 = fsub double -0.000000e+00, %85
%87 = bitcast float %30 to i32
%88 = insertelement <2 x i32> undef, i32 %87, i32 0
%89 = bitcast float %31 to i32
%90 = insertelement <2 x i32> %88, i32 %89, i32 1
%91 = bitcast <2 x i32> %90 to double
%92 = fmul double %91, %75
%93 = fsub double %92, %80
%94 = fmul double %46, %75
%95 = fsub double %86, %94
%96 = fdiv double 1.000000e+00, %74
%97 = bitcast float %38 to i32
%98 = insertelement <2 x i32> undef, i32 %97, i32 0
%99 = bitcast float %39 to i32
%100 = insertelement <2 x i32> %98, i32 %99, i32 1
%101 = bitcast <2 x i32> %100 to double
%102 = fsub double -0.000000e+00, %101
%103 = bitcast float %40 to i32
%104 = insertelement <2 x i32> undef, i32 %103, i32 0
%105 = bitcast float %41 to i32
%106 = insertelement <2 x i32> %104, i32 %105, i32 1
%107 = bitcast <2 x i32> %106 to double
%108 = fmul double %51, %96
%109 = fsub double %102, %108
%110 = bitcast float %24 to i32
%111 = insertelement <2 x i32> undef, i32 %110, i32 0
%112 = bitcast float %25 to i32
%113 = insertelement <2 x i32> %111, i32 %112, i32 1
%114 = bitcast <2 x i32> %113 to double
%115 = fmul double %114, %96
%116 = fsub double %115, %107
%117 = fmul double %93, %93
%118 = fmul double %95, %95
%119 = fadd double %118, %117
%120 = fmul double %109, %109
%121 = fadd double %120, %119
%122 = fmul double %116, %116
%123 = fadd double %122, %121
%124 = bitcast float %32 to i32
%125 = insertelement <2 x i32> undef, i32 %124, i32 0
%126 = bitcast float %33 to i32
%127 = insertelement <2 x i32> %125, i32 %126, i32 1
%128 = bitcast <2 x i32> %127 to double
%129 = bitcast float %32 to i32
%130 = insertelement <2 x i32> undef, i32 %129, i32 0
%131 = bitcast float %33 to i32
%132 = insertelement <2 x i32> %130, i32 %131, i32 1
%133 = bitcast <2 x i32> %132 to double
%134 = fmul double %128, %133
%135 = fcmp oge double %134, %123
%. = select i1 %135, float 1.000000e+00, float 0.000000e+00
%.24 = select i1 %135, float 0.000000e+00, float 1.000000e+00
%136 = call i32 @llvm.SI.packf16(float %.24, float %.)
%137 = bitcast i32 %136 to float
%138 = call i32 @llvm.SI.packf16(float 0.000000e+00, float 1.000000e+00)
%139 = bitcast i32 %138 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %137, float %139, float %137, float %139)
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }
!0 = !{!"const", null, i32 1}
Shader Disassembly:
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; C0800300
s_waitcnt lgkmcnt(0) ; BF8C007F
s_buffer_load_dword s4, s[0:3], 0x0 ; C2020100
s_buffer_load_dword s5, s[0:3], 0x1 ; C2028101
s_buffer_load_dword s6, s[0:3], 0x2 ; C2030102
s_buffer_load_dword s7, s[0:3], 0x3 ; C2038103
s_buffer_load_dword s8, s[0:3], 0x4 ; C2040104
s_buffer_load_dword s9, s[0:3], 0x5 ; C2048105
s_buffer_load_dword s10, s[0:3], 0x6 ; C2050106
s_buffer_load_dword s11, s[0:3], 0x7 ; C2058107
s_buffer_load_dword s12, s[0:3], 0x8 ; C2060108
s_buffer_load_dword s13, s[0:3], 0x9 ; C2068109
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v0, s4 ; 7E000204
s_buffer_load_dword s4, s[0:3], 0xc ; C202010C
v_mov_b32_e32 v1, s5 ; 7E020205
s_buffer_load_dword s5, s[0:3], 0xd ; C202810D
s_buffer_load_dword s14, s[0:3], 0xe ; C207010E
s_buffer_load_dword s15, s[0:3], 0xf ; C207810F
s_buffer_load_dword s16, s[0:3], 0x10 ; C2080110
v_mov_b32_e32 v2, s6 ; 7E040206
v_mov_b32_e32 v4, s8 ; 7E080208
v_mov_b32_e32 v6, s10 ; 7E0C020A
v_mov_b32_e32 v3, s7 ; 7E060207
v_mov_b32_e32 v5, s9 ; 7E0A0209
v_mov_b32_e32 v7, s11 ; 7E0E020B
v_mul_f64 v[8:9], v[4:5], v[2:3] ; D2CA0008 00020504
v_mul_f64 v[10:11], v[0:1], v[6:7] ; D2CA000A 00020D00
v_add_f64 v[8:9], v[10:11], -v[8:9] ; D2C80008 4002110A
v_div_scale_f64 v[10:11], s[6:7], v[8:9], v[8:9], 1.0 ; D2DC060A 00021108
v_rcp_f64_e32 v[12:13], v[10:11] ; 7E185F0A
v_div_scale_f64 v[14:15], vcc, 1.0, v[8:9], 1.0 ; D2DC6A0E 000210F2
v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 ; D2980010 23CA190A
v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] ; D298000C 0432210C
v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 ; D2980010 23CA190A
v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] ; D298000C 0432210C
v_mul_f64 v[16:17], v[14:15], v[12:13] ; D2CA0010 0002190E
v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] ; D298000A 243A210A
v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; D2E0000A 0442190A
v_div_fixup_f64 v[8:9], v[10:11], v[8:9], 1.0 ; D2C00008 03CA110A
v_mul_f64 v[6:7], v[6:7], v[8:9] ; D2CA0006 00021106
v_mul_f64 v[2:3], v[2:3], v[8:9] ; D2CA0002 00021102
s_buffer_load_dword s6, s[0:3], 0x11 ; C2030111
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v10, s4 ; 7E140204
v_mov_b32_e32 v11, s5 ; 7E160205
v_mul_f64 v[4:5], v[4:5], v[8:9] ; D2CA0004 00021104
v_mov_b32_e32 v12, s14 ; 7E18020E
v_add_f64 v[6:7], v[6:7], -v[10:11] ; D2C80006 40021506
v_mov_b32_e32 v13, s15 ; 7E1A020F
v_add_f64 v[2:3], -v[12:13], -v[2:3] ; D2C80002 6002050C
s_buffer_load_dword s4, s[0:3], 0x12 ; C2020112
s_buffer_load_dword s0, s[0:3], 0x13 ; C2000113
v_mov_b32_e32 v10, s16 ; 7E140210
v_mov_b32_e32 v11, s6 ; 7E160206
v_mul_f64 v[0:1], v[0:1], v[8:9] ; D2CA0000 00021100
v_add_f64 v[4:5], -v[10:11], -v[4:5] ; D2C80004 6002090A
v_mul_f64 v[6:7], v[6:7], v[6:7] ; D2CA0006 00020D06
v_mul_f64 v[2:3], v[2:3], v[2:3] ; D2CA0002 00020502
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v8, s4 ; 7E100204
v_mov_b32_e32 v9, s0 ; 7E120200
v_add_f64 v[0:1], v[0:1], -v[8:9] ; D2C80000 40021100
v_mul_f64 v[4:5], v[4:5], v[4:5] ; D2CA0004 00020904
v_add_f64 v[2:3], v[2:3], v[6:7] ; D2C80002 00020D02
v_mul_f64 v[0:1], v[0:1], v[0:1] ; D2CA0000 00020100
v_add_f64 v[2:3], v[4:5], v[2:3] ; D2C80002 00020504
v_mov_b32_e32 v4, s12 ; 7E08020C
v_mov_b32_e32 v5, s13 ; 7E0A020D
v_mul_f64 v[4:5], v[4:5], v[4:5] ; D2CA0004 00020904
v_add_f64 v[0:1], v[0:1], v[2:3] ; D2C80000 00020500
v_cmp_ge_f64_e32 vcc, v[4:5], v[0:1] ; 7C4C0104
v_cndmask_b32_e64 v0, 0, 1.0, vcc ; D2000000 01A9E480
v_cndmask_b32_e64 v1, 1.0, 0, vcc ; D2000001 01A900F2
v_cvt_pkrtz_f16_f32_e32 v0, v1, v0 ; 5E000101
v_cvt_pkrtz_f16_f32_e64 v1, 0, 1.0 ; D25E0001 0001E480
exp 15, 0, 1, 1, 1, v0, v1, v0, v1 ; F8001C0F 01000100
s_endpgm ; BF810000
*** SHADER STATS ***
SGPRS: 24
VGPRS: 20
Code Size: 444 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
SHADER KEY
export_16bpc = 0x3
last_cbuf = 0
color_two_side = 0
alpha_func = 7
alpha_to_one = 0
poly_stipple = 0
FRAG
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
0: MOV OUT[0], IN[0]
1: END
; ModuleID = 'tgsi'
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
%22 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %5)
%23 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %5)
%24 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %5)
%25 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %5)
%26 = call i32 @llvm.SI.packf16(float %22, float %23)
%27 = bitcast i32 %26 to float
%28 = call i32 @llvm.SI.packf16(float %24, float %25)
%29 = bitcast i32 %28 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %27, float %29, float %27, float %29)
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }
Shader Disassembly:
s_mov_b32 m0, s9 ; BEFC0309
v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_interp_mov_f32 v1, P0, 2, 0, [m0] ; C8060202
v_interp_mov_f32 v2, P0, 3, 0, [m0] ; C80A0302
v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 ; 5E020501
exp 15, 0, 1, 1, 1, v0, v1, v0, v1 ; F8001C0F 01000100
s_endpgm ; BF810000
*** SHADER STATS ***
SGPRS: 16
VGPRS: 4
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
SHADER KEY
export_16bpc = 0x0
last_cbuf = 0
color_two_side = 0
alpha_func = 7
alpha_to_one = 0
poly_stipple = 0
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
0: TEX OUT[0], IN[0], SAMP[0], 2D
1: END
; ModuleID = 'tgsi'
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
%22 = bitcast [34 x <8 x i32>] addrspace(2)* %3 to <32 x i8> addrspace(2)*
%23 = load <32 x i8>, <32 x i8> addrspace(2)* %22, align 32, !tbaa !0
%24 = bitcast [17 x <4 x i32>] addrspace(2)* %2 to <16 x i8> addrspace(2)*
%25 = load <16 x i8>, <16 x i8> addrspace(2)* %24, align 16, !tbaa !0
%26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %11)
%27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %11)
%28 = bitcast float %26 to i32
%29 = bitcast float %27 to i32
%30 = insertelement <2 x i32> undef, i32 %28, i32 0
%31 = insertelement <2 x i32> %30, i32 %29, i32 1
%32 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %31, <32 x i8> %23, <16 x i8> %25, i32 2)
%33 = extractelement <4 x float> %32, i32 0
%34 = extractelement <4 x float> %32, i32 1
%35 = extractelement <4 x float> %32, i32 2
%36 = extractelement <4 x float> %32, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %33, float %34, float %35, float %36)
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }
!0 = !{!"const", null, i32 1}
Shader Disassembly:
s_wqm_b64 exec, exec ; BEFE0A7E
s_load_dwordx8 s[12:19], s[6:7], 0x0 ; C0C60700
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; C0800500
s_mov_b32 m0, s9 ; BEFC0309
v_interp_p1_f32 v2, v0, 0, 0, [m0] ; C8080000
v_interp_p2_f32 v2, [v2], v1, 0, 0, [m0] ; C8090001
v_interp_p1_f32 v3, v0, 1, 0, [m0] ; C80C0100
v_interp_p2_f32 v3, [v3], v1, 1, 0, [m0] ; C80D0101
s_waitcnt lgkmcnt(0) ; BF8C007F
image_sample v[0:3], 15, 0, 0, 0, 0, 0, 0, 0, v[2:3], s[12:19], s[0:3] ; F0800F00 00030002
s_waitcnt vmcnt(0) ; BF8C0770
exp 15, 0, 0, 1, 1, v0, v1, v2, v3 ; F800180F 03020100
s_endpgm ; BF810000
*** SHADER STATS ***
SGPRS: 24
VGPRS: 4
Code Size: 60 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
-------------- next part --------------
SHADER KEY
export_16bpc = 0x3
last_cbuf = 0
color_two_side = 0
alpha_func = 7
alpha_to_one = 0
poly_stipple = 0
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL OUT[0], COLOR
DCL CONST[0..4]
DCL TEMP[0..5], LOCAL
IMM[0] FLT32 { 0.0000, 1.0000, 0.0000, 0.0000}
0: DNEG TEMP[0].xy, CONST[0].zwzw
1: DNEG TEMP[1].xy, CONST[1].xyxy
2: DMUL TEMP[2].xy, CONST[1].xyxy, CONST[0].zwzw
3: DNEG TEMP[2].xy, TEMP[2].xyxy
4: DMAD TEMP[2].xy, CONST[0].xyxy, CONST[1].zwzw, TEMP[2].xyxy
5: DRCP TEMP[3].xy, TEMP[2].xyxy
6: DNEG TEMP[4].xy, CONST[3].xyxy
7: DNEG TEMP[4].zw, CONST[3].zwzw
8: DMAD TEMP[5].xy, CONST[1].zwzw, TEMP[3].xyxy, TEMP[4].xyxy
9: DMAD TEMP[5].zw, TEMP[0].xyxy, TEMP[3].xyxy, TEMP[4].zwzw
10: DRCP TEMP[0].xy, TEMP[2].xyxy
11: DNEG TEMP[2].xy, CONST[4].xyxy
12: DNEG TEMP[2].zw, CONST[4].zwzw
13: DMAD TEMP[1].xy, TEMP[1].xyxy, TEMP[0].xyxy, TEMP[2].xyxy
14: DMAD TEMP[1].zw, CONST[0].xyxy, TEMP[0].xyxy, TEMP[2].zwzw
15: DMUL TEMP[0].xy, TEMP[5].xyxy, TEMP[5].xyxy
16: DMAD TEMP[0].xy, TEMP[5].zwzw, TEMP[5].zwzw, TEMP[0].xyxy
17: DMAD TEMP[0].xy, TEMP[1].xyxy, TEMP[1].xyxy, TEMP[0].xyxy
18: DMAD TEMP[0].xy, TEMP[1].zwzw, TEMP[1].zwzw, TEMP[0].xyxy
19: DMUL TEMP[1].xy, CONST[2].xyxy, CONST[2].xyxy
20: DSGE TEMP[0].x, TEMP[1].xyxy, TEMP[0].xyxy
21: UIF TEMP[0].xxxx :0
22: MOV TEMP[0], IMM[0].xyxy
23: ELSE :0
24: MOV TEMP[0], IMM[0].yxxy
25: ENDIF
26: MOV OUT[0], TEMP[0]
27: END
; ModuleID = 'tgsi'
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
%22 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0
%23 = load <16 x i8>, <16 x i8> addrspace(2)* %22, align 16, !tbaa !0
%24 = call float @llvm.SI.load.const(<16 x i8> %23, i32 0)
%25 = call float @llvm.SI.load.const(<16 x i8> %23, i32 4)
%26 = call float @llvm.SI.load.const(<16 x i8> %23, i32 8)
%27 = call float @llvm.SI.load.const(<16 x i8> %23, i32 12)
%28 = call float @llvm.SI.load.const(<16 x i8> %23, i32 16)
%29 = call float @llvm.SI.load.const(<16 x i8> %23, i32 20)
%30 = call float @llvm.SI.load.const(<16 x i8> %23, i32 24)
%31 = call float @llvm.SI.load.const(<16 x i8> %23, i32 28)
%32 = call float @llvm.SI.load.const(<16 x i8> %23, i32 32)
%33 = call float @llvm.SI.load.const(<16 x i8> %23, i32 36)
%34 = call float @llvm.SI.load.const(<16 x i8> %23, i32 48)
%35 = call float @llvm.SI.load.const(<16 x i8> %23, i32 52)
%36 = call float @llvm.SI.load.const(<16 x i8> %23, i32 56)
%37 = call float @llvm.SI.load.const(<16 x i8> %23, i32 60)
%38 = call float @llvm.SI.load.const(<16 x i8> %23, i32 64)
%39 = call float @llvm.SI.load.const(<16 x i8> %23, i32 68)
%40 = call float @llvm.SI.load.const(<16 x i8> %23, i32 72)
%41 = call float @llvm.SI.load.const(<16 x i8> %23, i32 76)
%42 = bitcast float %26 to i32
%43 = insertelement <2 x i32> undef, i32 %42, i32 0
%44 = bitcast float %27 to i32
%45 = insertelement <2 x i32> %43, i32 %44, i32 1
%46 = bitcast <2 x i32> %45 to double
%47 = bitcast float %28 to i32
%48 = insertelement <2 x i32> undef, i32 %47, i32 0
%49 = bitcast float %29 to i32
%50 = insertelement <2 x i32> %48, i32 %49, i32 1
%51 = bitcast <2 x i32> %50 to double
%52 = bitcast float %28 to i32
%53 = insertelement <2 x i32> undef, i32 %52, i32 0
%54 = bitcast float %29 to i32
%55 = insertelement <2 x i32> %53, i32 %54, i32 1
%56 = bitcast <2 x i32> %55 to double
%57 = bitcast float %26 to i32
%58 = insertelement <2 x i32> undef, i32 %57, i32 0
%59 = bitcast float %27 to i32
%60 = insertelement <2 x i32> %58, i32 %59, i32 1
%61 = bitcast <2 x i32> %60 to double
%62 = fmul double %56, %61
%63 = bitcast float %24 to i32
%64 = insertelement <2 x i32> undef, i32 %63, i32 0
%65 = bitcast float %25 to i32
%66 = insertelement <2 x i32> %64, i32 %65, i32 1
%67 = bitcast <2 x i32> %66 to double
%68 = bitcast float %30 to i32
%69 = insertelement <2 x i32> undef, i32 %68, i32 0
%70 = bitcast float %31 to i32
%71 = insertelement <2 x i32> %69, i32 %70, i32 1
%72 = bitcast <2 x i32> %71 to double
%73 = fmul double %67, %72
%74 = fsub double %73, %62
%75 = fdiv double 1.000000e+00, %74
%76 = bitcast float %34 to i32
%77 = insertelement <2 x i32> undef, i32 %76, i32 0
%78 = bitcast float %35 to i32
%79 = insertelement <2 x i32> %77, i32 %78, i32 1
%80 = bitcast <2 x i32> %79 to double
%81 = bitcast float %36 to i32
%82 = insertelement <2 x i32> undef, i32 %81, i32 0
%83 = bitcast float %37 to i32
%84 = insertelement <2 x i32> %82, i32 %83, i32 1
%85 = bitcast <2 x i32> %84 to double
%86 = fsub double -0.000000e+00, %85
%87 = bitcast float %30 to i32
%88 = insertelement <2 x i32> undef, i32 %87, i32 0
%89 = bitcast float %31 to i32
%90 = insertelement <2 x i32> %88, i32 %89, i32 1
%91 = bitcast <2 x i32> %90 to double
%92 = fmul double %91, %75
%93 = fsub double %92, %80
%94 = fmul double %46, %75
%95 = fsub double %86, %94
%96 = fdiv double 1.000000e+00, %74
%97 = bitcast float %38 to i32
%98 = insertelement <2 x i32> undef, i32 %97, i32 0
%99 = bitcast float %39 to i32
%100 = insertelement <2 x i32> %98, i32 %99, i32 1
%101 = bitcast <2 x i32> %100 to double
%102 = fsub double -0.000000e+00, %101
%103 = bitcast float %40 to i32
%104 = insertelement <2 x i32> undef, i32 %103, i32 0
%105 = bitcast float %41 to i32
%106 = insertelement <2 x i32> %104, i32 %105, i32 1
%107 = bitcast <2 x i32> %106 to double
%108 = fmul double %51, %96
%109 = fsub double %102, %108
%110 = bitcast float %24 to i32
%111 = insertelement <2 x i32> undef, i32 %110, i32 0
%112 = bitcast float %25 to i32
%113 = insertelement <2 x i32> %111, i32 %112, i32 1
%114 = bitcast <2 x i32> %113 to double
%115 = fmul double %114, %96
%116 = fsub double %115, %107
%117 = fmul double %93, %93
%118 = fmul double %95, %95
%119 = fadd double %118, %117
%120 = fmul double %109, %109
%121 = fadd double %120, %119
%122 = fmul double %116, %116
%123 = fadd double %122, %121
%124 = bitcast float %32 to i32
%125 = insertelement <2 x i32> undef, i32 %124, i32 0
%126 = bitcast float %33 to i32
%127 = insertelement <2 x i32> %125, i32 %126, i32 1
%128 = bitcast <2 x i32> %127 to double
%129 = bitcast float %32 to i32
%130 = insertelement <2 x i32> undef, i32 %129, i32 0
%131 = bitcast float %33 to i32
%132 = insertelement <2 x i32> %130, i32 %131, i32 1
%133 = bitcast <2 x i32> %132 to double
%134 = fmul double %128, %133
%135 = fcmp oge double %134, %123
%. = select i1 %135, float 1.000000e+00, float 0.000000e+00
%.24 = select i1 %135, float 0.000000e+00, float 1.000000e+00
%136 = call i32 @llvm.SI.packf16(float %.24, float %.)
%137 = bitcast i32 %136 to float
%138 = call i32 @llvm.SI.packf16(float 0.000000e+00, float 1.000000e+00)
%139 = bitcast i32 %138 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %137, float %139, float %137, float %139)
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }
!0 = !{!"const", null, i32 1}
Shader Disassembly:
s_load_dwordx4 s[0:3], s[2:3], 0x0 ; C0800300
s_waitcnt lgkmcnt(0) ; BF8C007F
s_buffer_load_dword s4, s[0:3], 0x0 ; C2020100
s_buffer_load_dword s5, s[0:3], 0x1 ; C2028101
s_buffer_load_dword s6, s[0:3], 0x2 ; C2030102
s_buffer_load_dword s7, s[0:3], 0x3 ; C2038103
s_buffer_load_dword s8, s[0:3], 0x4 ; C2040104
s_buffer_load_dword s9, s[0:3], 0x5 ; C2048105
s_buffer_load_dword s10, s[0:3], 0x6 ; C2050106
s_buffer_load_dword s11, s[0:3], 0x7 ; C2058107
s_buffer_load_dword s12, s[0:3], 0x8 ; C2060108
s_buffer_load_dword s13, s[0:3], 0x9 ; C2068109
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v0, s4 ; 7E000204
s_buffer_load_dword s4, s[0:3], 0xc ; C202010C
v_mov_b32_e32 v1, s5 ; 7E020205
s_buffer_load_dword s5, s[0:3], 0xd ; C202810D
s_buffer_load_dword s14, s[0:3], 0xe ; C207010E
s_buffer_load_dword s15, s[0:3], 0xf ; C207810F
s_buffer_load_dword s16, s[0:3], 0x10 ; C2080110
v_mov_b32_e32 v2, s6 ; 7E040206
v_mov_b32_e32 v4, s8 ; 7E080208
v_mov_b32_e32 v6, s10 ; 7E0C020A
v_mov_b32_e32 v3, s7 ; 7E060207
v_mov_b32_e32 v5, s9 ; 7E0A0209
v_mov_b32_e32 v7, s11 ; 7E0E020B
v_mul_f64 v[8:9], v[4:5], v[2:3] ; D2CA0008 00020504
v_mul_f64 v[10:11], v[0:1], v[6:7] ; D2CA000A 00020D00
v_add_f64 v[8:9], v[10:11], -v[8:9] ; D2C80008 4002110A
v_div_scale_f64 v[10:11], s[6:7], v[8:9], v[8:9], 1.0 ; D2DC060A 03CA1108
v_rcp_f64_e32 v[12:13], v[10:11] ; 7E185F0A
v_div_scale_f64 v[14:15], vcc, 1.0, v[8:9], 1.0 ; D2DC6A0E 03CA10F2
v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 ; D2980010 23CA190A
v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] ; D298000C 0432210C
v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 ; D2980010 23CA190A
v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] ; D298000C 0432210C
v_mul_f64 v[16:17], v[14:15], v[12:13] ; D2CA0010 0002190E
v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] ; D298000A 243A210A
v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; D2E0000A 0442190A
v_div_fixup_f64 v[8:9], v[10:11], v[8:9], 1.0 ; D2C00008 03CA110A
v_mul_f64 v[6:7], v[6:7], v[8:9] ; D2CA0006 00021106
v_mul_f64 v[2:3], v[2:3], v[8:9] ; D2CA0002 00021102
s_buffer_load_dword s6, s[0:3], 0x11 ; C2030111
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v10, s4 ; 7E140204
v_mov_b32_e32 v11, s5 ; 7E160205
v_mul_f64 v[4:5], v[4:5], v[8:9] ; D2CA0004 00021104
v_mov_b32_e32 v12, s14 ; 7E18020E
v_add_f64 v[6:7], v[6:7], -v[10:11] ; D2C80006 40021506
v_mov_b32_e32 v13, s15 ; 7E1A020F
v_add_f64 v[2:3], -v[12:13], -v[2:3] ; D2C80002 6002050C
s_buffer_load_dword s4, s[0:3], 0x12 ; C2020112
s_buffer_load_dword s0, s[0:3], 0x13 ; C2000113
v_mov_b32_e32 v10, s16 ; 7E140210
v_mov_b32_e32 v11, s6 ; 7E160206
v_mul_f64 v[0:1], v[0:1], v[8:9] ; D2CA0000 00021100
v_add_f64 v[4:5], -v[10:11], -v[4:5] ; D2C80004 6002090A
v_mul_f64 v[6:7], v[6:7], v[6:7] ; D2CA0006 00020D06
v_mul_f64 v[2:3], v[2:3], v[2:3] ; D2CA0002 00020502
s_waitcnt lgkmcnt(0) ; BF8C007F
v_mov_b32_e32 v8, s4 ; 7E100204
v_mov_b32_e32 v9, s0 ; 7E120200
v_add_f64 v[0:1], v[0:1], -v[8:9] ; D2C80000 40021100
v_mul_f64 v[4:5], v[4:5], v[4:5] ; D2CA0004 00020904
v_add_f64 v[2:3], v[2:3], v[6:7] ; D2C80002 00020D02
v_mul_f64 v[0:1], v[0:1], v[0:1] ; D2CA0000 00020100
v_add_f64 v[2:3], v[4:5], v[2:3] ; D2C80002 00020504
v_mov_b32_e32 v4, s12 ; 7E08020C
v_mov_b32_e32 v5, s13 ; 7E0A020D
v_mul_f64 v[4:5], v[4:5], v[4:5] ; D2CA0004 00020904
v_add_f64 v[0:1], v[0:1], v[2:3] ; D2C80000 00020500
v_cmp_ge_f64_e32 vcc, v[4:5], v[0:1] ; 7C4C0104
v_cndmask_b32_e64 v0, 0, 1.0, vcc ; D2000000 01A9E480
v_cndmask_b32_e64 v1, 1.0, 0, vcc ; D2000001 01A900F2
v_cvt_pkrtz_f16_f32_e32 v0, v1, v0 ; 5E000101
v_cvt_pkrtz_f16_f32_e64 v1, 0, 1.0 ; D25E0001 0001E480
exp 15, 0, 1, 1, 1, v0, v1, v0, v1 ; F8001C0F 01000100
s_endpgm ; BF810000
*** SHADER STATS ***
SGPRS: 24
VGPRS: 20
Code Size: 444 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
SHADER KEY
export_16bpc = 0x3
last_cbuf = 0
color_two_side = 0
alpha_func = 7
alpha_to_one = 0
poly_stipple = 0
FRAG
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
0: MOV OUT[0], IN[0]
1: END
; ModuleID = 'tgsi'
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
%22 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %5)
%23 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %5)
%24 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %5)
%25 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %5)
%26 = call i32 @llvm.SI.packf16(float %22, float %23)
%27 = bitcast i32 %26 to float
%28 = call i32 @llvm.SI.packf16(float %24, float %25)
%29 = bitcast i32 %28 to float
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %27, float %29, float %27, float %29)
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1
; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }
Shader Disassembly:
s_mov_b32 m0, s9 ; BEFC0309
v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
v_interp_mov_f32 v1, P0, 2, 0, [m0] ; C8060202
v_interp_mov_f32 v2, P0, 3, 0, [m0] ; C80A0302
v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 ; 5E020501
exp 15, 0, 1, 1, 1, v0, v1, v0, v1 ; F8001C0F 01000100
s_endpgm ; BF810000
*** SHADER STATS ***
SGPRS: 16
VGPRS: 4
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
SHADER KEY
export_16bpc = 0x0
last_cbuf = 0
color_two_side = 0
alpha_func = 7
alpha_to_one = 0
poly_stipple = 0
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
0: TEX OUT[0], IN[0], SAMP[0], 2D
1: END
; ModuleID = 'tgsi'
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
%22 = bitcast [34 x <8 x i32>] addrspace(2)* %3 to <32 x i8> addrspace(2)*
%23 = load <32 x i8>, <32 x i8> addrspace(2)* %22, align 32, !tbaa !0
%24 = bitcast [17 x <4 x i32>] addrspace(2)* %2 to <16 x i8> addrspace(2)*
%25 = load <16 x i8>, <16 x i8> addrspace(2)* %24, align 16, !tbaa !0
%26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %11)
%27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %11)
%28 = bitcast float %26 to i32
%29 = bitcast float %27 to i32
%30 = insertelement <2 x i32> undef, i32 %28, i32 0
%31 = insertelement <2 x i32> %30, i32 %29, i32 1
%32 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %31, <32 x i8> %23, <16 x i8> %25, i32 2)
%33 = extractelement <4 x float> %32, i32 0
%34 = extractelement <4 x float> %32, i32 1
%35 = extractelement <4 x float> %32, i32 2
%36 = extractelement <4 x float> %32, i32 3
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %33, float %34, float %35, float %36)
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }
!0 = !{!"const", null, i32 1}
Shader Disassembly:
s_wqm_b64 exec, exec ; BEFE0A7E
s_load_dwordx8 s[12:19], s[6:7], 0x0 ; C0C60700
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; C0800500
s_mov_b32 m0, s9 ; BEFC0309
v_interp_p1_f32 v2, v0, 0, 0, [m0] ; C8080000
v_interp_p2_f32 v2, [v2], v1, 0, 0, [m0] ; C8090001
v_interp_p1_f32 v3, v0, 1, 0, [m0] ; C80C0100
v_interp_p2_f32 v3, [v3], v1, 1, 0, [m0] ; C80D0101
s_waitcnt lgkmcnt(0) ; BF8C007F
image_sample v[0:3], 15, 0, 0, 0, 0, 0, 0, 0, v[2:3], s[12:19], s[0:3] ; F0800F00 00030002
s_waitcnt vmcnt(0) ; BF8C0770
exp 15, 0, 0, 1, 1, v0, v1, v2, v3 ; F800180F 03020100
s_endpgm ; BF810000
*** SHADER STATS ***
SGPRS: 24
VGPRS: 4
Code Size: 60 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
More information about the llvm-commits mailing list