AMDGPU/SI: Fix bitcast between v2f32 and f64

Tue Dec 15 09:19:18 PST 2015

On Fri, Dec 11, 2015 at 04:37:47PM +0900, Michel Dänzer wrote:
> 
> Attached is a failure generated by the radeonsi driver with the piglit
> graphics test fs-abs-dvec2.shader_test and a patch fixing it (and
> similar failures in many other fp64 tests). I haven't been able to
> produce a simple test case for this.
> 

r255657.

-Tom
> 
> -- 
> Earthling Michel Dänzer               |               http://www.amd.com
> Libre software enthusiast             |             Mesa and X developer

> From 6213cb7c9c3bdc2d6306c278c2b6466a27dec71c Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer at amd.com>
> Date: Fri, 11 Dec 2015 16:10:30 +0900
> Subject: [PATCH] AMDGPU/SI: Fix bitcast between v2f32 and f64
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> The radeonsi fp64 support can hit these now that some redundant bitcasts
> are folded.
> 
> Signed-off-by: Michel Dänzer <michel.daenzer at amd.com>
> ---
>  lib/Target/AMDGPU/SIInstructions.td | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
> index dcc74f7..d45762b 100644
> --- a/lib/Target/AMDGPU/SIInstructions.td
> +++ b/lib/Target/AMDGPU/SIInstructions.td
> @@ -2561,7 +2561,9 @@ def : BitConvert <v2i32, i64, VReg_64>;
>  def : BitConvert <i64, v2i32, VReg_64>;
>  def : BitConvert <v2f32, i64, VReg_64>;
>  def : BitConvert <i64, v2f32, VReg_64>;
> +def : BitConvert <v2f32, f64, VReg_64>;
>  def : BitConvert <v2i32, f64, VReg_64>;
> +def : BitConvert <f64, v2f32, VReg_64>;
>  def : BitConvert <f64, v2i32, VReg_64>;
>  def : BitConvert <v4f32, v4i32, VReg_128>;
>  def : BitConvert <v4i32, v4f32, VReg_128>;
> @@ -2570,7 +2572,9 @@ def : BitConvert <v4i32, v4f32, VReg_128>;
>  def : BitConvert <v2i64, v4i32, SReg_128>;
>  def : BitConvert <v4i32, v2i64, SReg_128>;
>  
> +def : BitConvert <v2f64, v4f32, VReg_128>;
>  def : BitConvert <v2f64, v4i32, VReg_128>;
> +def : BitConvert <v4f32, v2f64, VReg_128>;
>  def : BitConvert <v4i32, v2f64, VReg_128>;
>  
>  
> -- 
> 2.6.2
> 

> SHADER KEY
>   instance_divisors = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
>   as_es = 0
>   as_ls = 0
>   export_prim_id = 0
> VERT
> DCL IN[0]
> DCL OUT[0], POSITION
>   0: MOV OUT[0], IN[0]
>   1: END
> ; ModuleID = 'tgsi'
> 
> define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
> main_body:
>   %12 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
>   %13 = load <16 x i8>, <16 x i8> addrspace(2)* %12, align 16, !tbaa !0
>   %14 = add i32 %5, %8
>   %15 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %13, i32 0, i32 %14)
>   %16 = extractelement <4 x float> %15, i32 0
>   %17 = extractelement <4 x float> %15, i32 1
>   %18 = extractelement <4 x float> %15, i32 2
>   %19 = extractelement <4 x float> %15, i32 3
>   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %16, float %17, float %18, float %19)
>   ret void
> }
> 
> ; Function Attrs: nounwind readnone
> declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
> 
> declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
> 
> attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
> attributes #1 = { nounwind readnone }
> 
> !0 = !{!"const", null, i32 1}
> 
> Shader Disassembly:
> 
> 	s_load_dwordx4 s[0:3], s[8:9], 0x0                  ; C0800900
> 	v_add_i32_e32 v0, vcc, s10, v0                      ; 4A00000A
> 	s_waitcnt lgkmcnt(0)                                ; BF8C007F
> 	buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen ; E00C2000 80000000
> 	s_waitcnt vmcnt(0)                                  ; BF8C0770
> 	exp 15, 12, 0, 1, 0, v0, v1, v2, v3                 ; F80008CF 03020100
> 	s_endpgm                                            ; BF810000
> 
> *** SHADER STATS ***
> SGPRS: 16
> VGPRS: 4
> Code Size: 36 bytes
> LDS: 0 blocks
> Scratch: 0 bytes per wave
> ********************
> SHADER KEY
>   export_16bpc = 0x3
>   last_cbuf = 0
>   color_two_side = 0
>   alpha_func = 7
>   alpha_to_one = 0
>   poly_stipple = 0
>   clamp_color = 0
> FRAG
> PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
> DCL OUT[0], COLOR
> DCL CONST[0..2]
> DCL TEMP[0..2], LOCAL
> IMM[0] FLT32 {    0.0000,     1.0000,     0.0000,     0.0000}
>   0: DABS TEMP[0].xy, CONST[0].xyxy
>   1: DABS TEMP[0].zw, CONST[0].zwzw
>   2: DNEG TEMP[1].xy, CONST[2].xyxy
>   3: DNEG TEMP[1].zw, CONST[2].zwzw
>   4: DADD TEMP[2].xy, TEMP[0].xyxy, TEMP[1].xyxy
>   5: DADD TEMP[2].zw, TEMP[0].zwzw, TEMP[1].zwzw
>   6: DMUL TEMP[0].xy, TEMP[2].zwzw, TEMP[2].zwzw
>   7: DFMA TEMP[0].xy, TEMP[2].xyxy, TEMP[2].xyxy, TEMP[0].xyxy
>   8: DSQRT TEMP[0].xy, TEMP[0].xyxy
>   9: DSGE TEMP[0].x, CONST[1].xyxy, TEMP[0].xyxy
>  10: UIF TEMP[0].xxxx :0
>  11:   MOV TEMP[0], IMM[0].xyxy
>  12: ELSE :0
>  13:   MOV TEMP[0], IMM[0].yxxy
>  14: ENDIF
>  15: MOV OUT[0], TEMP[0]
>  16: END
> ; ModuleID = 'tgsi'
> 
> define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
> main_body:
>   %23 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0
>   %24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !tbaa !0
>   %25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 0)
>   %26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 4)
>   %27 = call float @llvm.SI.load.const(<16 x i8> %24, i32 8)
>   %28 = call float @llvm.SI.load.const(<16 x i8> %24, i32 12)
>   %29 = call float @llvm.SI.load.const(<16 x i8> %24, i32 16)
>   %30 = call float @llvm.SI.load.const(<16 x i8> %24, i32 20)
>   %31 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
>   %32 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
>   %33 = call float @llvm.SI.load.const(<16 x i8> %24, i32 40)
>   %34 = call float @llvm.SI.load.const(<16 x i8> %24, i32 44)
>   %35 = bitcast float %25 to i32
>   %36 = insertelement <2 x i32> undef, i32 %35, i32 0
>   %37 = bitcast float %26 to i32
>   %38 = insertelement <2 x i32> %36, i32 %37, i32 1
>   %39 = bitcast <2 x i32> %38 to double
>   %40 = call double @llvm.fabs.f64(double %39)
>   %41 = bitcast float %27 to i32
>   %42 = insertelement <2 x i32> undef, i32 %41, i32 0
>   %43 = bitcast float %28 to i32
>   %44 = insertelement <2 x i32> %42, i32 %43, i32 1
>   %45 = bitcast <2 x i32> %44 to double
>   %46 = call double @llvm.fabs.f64(double %45)
>   %47 = bitcast float %31 to i32
>   %48 = insertelement <2 x i32> undef, i32 %47, i32 0
>   %49 = bitcast float %32 to i32
>   %50 = insertelement <2 x i32> %48, i32 %49, i32 1
>   %51 = bitcast <2 x i32> %50 to double
>   %52 = fsub double -0.000000e+00, %51
>   %53 = bitcast float %33 to i32
>   %54 = insertelement <2 x i32> undef, i32 %53, i32 0
>   %55 = bitcast float %34 to i32
>   %56 = insertelement <2 x i32> %54, i32 %55, i32 1
>   %57 = bitcast <2 x i32> %56 to double
>   %58 = fsub double -0.000000e+00, %57
>   %bc20 = bitcast double %40 to <2 x i32>
>   %bc = bitcast double %40 to <2 x i32>
>   %59 = shufflevector <2 x i32> %bc20, <2 x i32> %bc, <2 x i32> <i32 0, i32 3>
>   %60 = bitcast <2 x i32> %59 to double
>   %bc21 = bitcast double %52 to <2 x i32>
>   %bc22 = bitcast double %52 to <2 x i32>
>   %61 = shufflevector <2 x i32> %bc21, <2 x i32> %bc22, <2 x i32> <i32 0, i32 3>
>   %62 = bitcast <2 x i32> %61 to double
>   %63 = fadd double %60, %62
>   %bc23 = bitcast double %63 to <2 x float>
>   %64 = extractelement <2 x float> %bc23, i32 0
>   %bc24 = bitcast double %63 to <2 x float>
>   %65 = extractelement <2 x float> %bc24, i32 1
>   %bc25 = bitcast double %46 to <2 x i32>
>   %bc26 = bitcast double %46 to <2 x i32>
>   %66 = shufflevector <2 x i32> %bc25, <2 x i32> %bc26, <2 x i32> <i32 0, i32 3>
>   %67 = bitcast <2 x i32> %66 to double
>   %bc27 = bitcast double %58 to <2 x i32>
>   %bc28 = bitcast double %58 to <2 x i32>
>   %68 = shufflevector <2 x i32> %bc27, <2 x i32> %bc28, <2 x i32> <i32 0, i32 3>
>   %69 = bitcast <2 x i32> %68 to double
>   %70 = fadd double %67, %69
>   %bc29 = bitcast double %70 to <2 x float>
>   %71 = extractelement <2 x float> %bc29, i32 0
>   %bc30 = bitcast double %70 to <2 x float>
>   %72 = extractelement <2 x float> %bc30, i32 1
>   %73 = bitcast float %71 to i32
>   %74 = insertelement <2 x i32> undef, i32 %73, i32 0
>   %75 = bitcast float %72 to i32
>   %76 = insertelement <2 x i32> %74, i32 %75, i32 1
>   %77 = bitcast <2 x i32> %76 to double
>   %78 = bitcast float %71 to i32
>   %79 = insertelement <2 x i32> undef, i32 %78, i32 0
>   %80 = bitcast float %72 to i32
>   %81 = insertelement <2 x i32> %79, i32 %80, i32 1
>   %82 = bitcast <2 x i32> %81 to double
>   %83 = fmul double %77, %82
>   %84 = bitcast float %64 to i32
>   %85 = insertelement <2 x i32> undef, i32 %84, i32 0
>   %86 = bitcast float %65 to i32
>   %87 = insertelement <2 x i32> %85, i32 %86, i32 1
>   %88 = bitcast <2 x i32> %87 to double
>   %89 = bitcast float %64 to i32
>   %90 = insertelement <2 x i32> undef, i32 %89, i32 0
>   %91 = bitcast float %65 to i32
>   %92 = insertelement <2 x i32> %90, i32 %91, i32 1
>   %93 = bitcast <2 x i32> %92 to double
>   %bc33 = bitcast double %83 to <2 x i32>
>   %bc34 = bitcast double %83 to <2 x i32>
>   %94 = shufflevector <2 x i32> %bc33, <2 x i32> %bc34, <2 x i32> <i32 0, i32 3>
>   %95 = bitcast <2 x i32> %94 to double
>   %96 = call double @llvm.fma.f64(double %88, double %93, double %95)
>   %bc37 = bitcast double %96 to <2 x i32>
>   %bc38 = bitcast double %96 to <2 x i32>
>   %97 = shufflevector <2 x i32> %bc37, <2 x i32> %bc38, <2 x i32> <i32 0, i32 3>
>   %98 = bitcast <2 x i32> %97 to double
>   %99 = call double @llvm.sqrt.f64(double %98)
>   %100 = bitcast float %29 to i32
>   %101 = insertelement <2 x i32> undef, i32 %100, i32 0
>   %102 = bitcast float %30 to i32
>   %103 = insertelement <2 x i32> %101, i32 %102, i32 1
>   %104 = bitcast <2 x i32> %103 to double
>   %bc41 = bitcast double %99 to <2 x i32>
>   %bc42 = bitcast double %99 to <2 x i32>
>   %105 = shufflevector <2 x i32> %bc41, <2 x i32> %bc42, <2 x i32> <i32 0, i32 3>
>   %106 = bitcast <2 x i32> %105 to double
>   %107 = fcmp oge double %104, %106
>   %. = select i1 %107, float 1.000000e+00, float 0.000000e+00
>   %.12 = select i1 %107, float 0.000000e+00, float 1.000000e+00
>   %108 = call i32 @llvm.SI.packf16(float %.12, float %.)
>   %109 = bitcast i32 %108 to float
>   %110 = call i32 @llvm.SI.packf16(float 0.000000e+00, float 1.000000e+00)
>   %111 = bitcast i32 %110 to float
>   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %109, float %111, float %109, float %111)
>   ret void
> }
> 
> ; Function Attrs: nounwind readnone
> declare float @llvm.SI.load.const(<16 x i8>, i32) #1
> 
> ; Function Attrs: nounwind readnone
> declare double @llvm.fabs.f64(double) #1
> 
> ; Function Attrs: nounwind readnone
> declare double @llvm.fma.f64(double, double, double) #1
> 
> ; Function Attrs: nounwind readnone
> declare double @llvm.sqrt.f64(double) #1
> 
> ; Function Attrs: nounwind readnone
> declare i32 @llvm.SI.packf16(float, float) #1
> 
> declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
> 
> attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
> attributes #1 = { nounwind readnone }
> 
> !0 = !{!"const", null, i32 1}
> 
> Shader Disassembly:
> 
> 	s_load_dwordx4 s[0:3], s[2:3], 0x0       ; C0800300
> 	s_waitcnt lgkmcnt(0)                     ; BF8C007F
> 	s_buffer_load_dword s4, s[0:3], 0x0      ; C2020100
> 	s_buffer_load_dword s6, s[0:3], 0x2      ; C2030102
> 	s_buffer_load_dword s8, s[0:3], 0x4      ; C2040104
> 	s_buffer_load_dword s10, s[0:3], 0x8     ; C2050108
> 	s_buffer_load_dword s11, s[0:3], 0x9     ; C2058109
> 	s_buffer_load_dword s12, s[0:3], 0xa     ; C206010A
> 	s_buffer_load_dword s13, s[0:3], 0xb     ; C206810B
> 	s_buffer_load_dword s7, s[0:3], 0x3      ; C2038103
> 	s_buffer_load_dword s5, s[0:3], 0x1      ; C2028101
> 	s_buffer_load_dword s9, s[0:3], 0x5      ; C2048105
> 	s_waitcnt lgkmcnt(0)                     ; BF8C007F
> 	v_mov_b32_e32 v0, s10                    ; 7E00020A
> 	v_mov_b32_e32 v1, s11                    ; 7E02020B
> 	v_mov_b32_e32 v2, s12                    ; 7E04020C
> 	v_mov_b32_e32 v3, s13                    ; 7E06020D
> 	v_add_f64 v[2:3], |s[6:7]|, -v[2:3]      ; D2C80102 40020406
> 	v_add_f64 v[0:1], |s[4:5]|, -v[0:1]      ; D2C80100 40020004
> 	v_mul_f64 v[2:3], v[2:3], v[2:3]         ; D2CA0002 00020502
> 	v_fma_f64 v[0:1], v[0:1], v[0:1], v[2:3] ; D2980000 040A0100
> 	v_sqrt_f64_e32 v[0:1], v[0:1]            ; 7E006900
> 	v_cmp_ge_f64_e32 vcc, s[8:9], v[0:1]     ; 7C4C0008
> 	v_cndmask_b32_e64 v0, 0, 1.0, vcc        ; D2000000 01A9E480
> 	v_cndmask_b32_e64 v1, 1.0, 0, vcc        ; D2000001 01A900F2
> 	v_cvt_pkrtz_f16_f32_e32 v0, v1, v0       ; 5E000101
> 	v_cvt_pkrtz_f16_f32_e64 v1, 0, 1.0       ; D25E0001 0001E480
> 	exp 15, 0, 1, 1, 1, v0, v1, v0, v1       ; F8001C0F 01000100
> 	s_endpgm                                 ; BF810000
> 
> *** SHADER STATS ***
> SGPRS: 16
> VGPRS: 4
> Code Size: 148 bytes
> LDS: 0 blocks
> Scratch: 0 bytes per wave
> ********************
> SHADER KEY
>   instance_divisors = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
>   as_es = 0
>   as_ls = 0
>   export_prim_id = 0
> VERT
> DCL IN[0]
> DCL IN[1]
> DCL OUT[0], POSITION
> DCL OUT[1], GENERIC[0]
>   0: MOV OUT[0], IN[0]
>   1: MOV OUT[1], IN[1]
>   2: END
> ; ModuleID = 'tgsi'
> 
> define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
> main_body:
>   %12 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
>   %13 = load <16 x i8>, <16 x i8> addrspace(2)* %12, align 16, !tbaa !0
>   %14 = add i32 %5, %8
>   %15 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %13, i32 0, i32 %14)
>   %16 = extractelement <4 x float> %15, i32 0
>   %17 = extractelement <4 x float> %15, i32 1
>   %18 = extractelement <4 x float> %15, i32 2
>   %19 = extractelement <4 x float> %15, i32 3
>   %20 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1
>   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, align 16, !tbaa !0
>   %22 = add i32 %5, %8
>   %23 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %21, i32 0, i32 %22)
>   %24 = extractelement <4 x float> %23, i32 0
>   %25 = extractelement <4 x float> %23, i32 1
>   %26 = extractelement <4 x float> %23, i32 2
>   %27 = extractelement <4 x float> %23, i32 3
>   call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %24, float %25, float %26, float %27)
>   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %16, float %17, float %18, float %19)
>   ret void
> }
> 
> ; Function Attrs: nounwind readnone
> declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
> 
> declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
> 
> attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
> attributes #1 = { nounwind readnone }
> 
> !0 = !{!"const", null, i32 1}
> 
> Shader Disassembly:
> 
> 	s_load_dwordx4 s[0:3], s[8:9], 0x0                  ; C0800900
> 	s_load_dwordx4 s[4:7], s[8:9], 0x4                  ; C0820904
> 	v_add_i32_e32 v0, vcc, s10, v0                      ; 4A00000A
> 	s_waitcnt lgkmcnt(0)                                ; BF8C007F
> 	buffer_load_format_xyzw v[1:4], v0, s[0:3], 0 idxen ; E00C2000 80000100
> 	buffer_load_format_xyzw v[5:8], v0, s[4:7], 0 idxen ; E00C2000 80010500
> 	s_waitcnt vmcnt(0)                                  ; BF8C0770
> 	exp 15, 32, 0, 0, 0, v5, v6, v7, v8                 ; F800020F 08070605
> 	exp 15, 12, 0, 1, 0, v1, v2, v3, v4                 ; F80008CF 04030201
> 	s_endpgm                                            ; BF810000
> 
> *** SHADER STATS ***
> SGPRS: 16
> VGPRS: 12
> Code Size: 56 bytes
> LDS: 0 blocks
> Scratch: 0 bytes per wave
> ********************
> SHADER KEY
>   export_16bpc = 0x3
>   last_cbuf = 0
>   color_two_side = 0
>   alpha_func = 7
>   alpha_to_one = 0
>   poly_stipple = 0
>   clamp_color = 0
> FRAG
> DCL IN[0], GENERIC[0], CONSTANT
> DCL OUT[0], COLOR
>   0: MOV OUT[0], IN[0]
>   1: END
> ; ModuleID = 'tgsi'
> 
> define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
> main_body:
>   %23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
>   %24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
>   %25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
>   %26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
>   %27 = call i32 @llvm.SI.packf16(float %23, float %24)
>   %28 = bitcast i32 %27 to float
>   %29 = call i32 @llvm.SI.packf16(float %25, float %26)
>   %30 = bitcast i32 %29 to float
>   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %28, float %30, float %28, float %30)
>   ret void
> }
> 
> ; Function Attrs: nounwind readnone
> declare float @llvm.SI.fs.constant(i32, i32, i32) #1
> 
> ; Function Attrs: nounwind readnone
> declare i32 @llvm.SI.packf16(float, float) #1
> 
> declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
> 
> attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
> attributes #1 = { nounwind readnone }
> 
> Shader Disassembly:
> 
> 	s_mov_b32 m0, s10                   ; BEFC030A
> 	v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
> 	v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
> 	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1  ; 5E000300
> 	v_interp_mov_f32 v1, P0, 2, 0, [m0] ; C8060202
> 	v_interp_mov_f32 v2, P0, 3, 0, [m0] ; C80A0302
> 	v_cvt_pkrtz_f16_f32_e32 v1, v1, v2  ; 5E020501
> 	exp 15, 0, 1, 1, 1, v0, v1, v0, v1  ; F8001C0F 01000100
> 	s_endpgm                            ; BF810000
> 
> *** SHADER STATS ***
> SGPRS: 16
> VGPRS: 4
> Code Size: 40 bytes
> LDS: 0 blocks
> Scratch: 0 bytes per wave
> ********************
> SHADER KEY
>   export_16bpc = 0x0
>   last_cbuf = 0
>   color_two_side = 0
>   alpha_func = 7
>   alpha_to_one = 0
>   poly_stipple = 0
>   clamp_color = 0
> FRAG
> DCL IN[0], GENERIC[0], LINEAR
> DCL OUT[0], COLOR
> DCL SAMP[0]
> DCL SVIEW[0], 2D, FLOAT
>   0: TEX OUT[0], IN[0], SAMP[0], 2D
>   1: END
> ; ModuleID = 'tgsi'
> 
> define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
> main_body:
>   %23 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %3, i64 0, i64 0
>   %24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !tbaa !0
>   %25 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %2, i64 0, i64 0
>   %26 = load <4 x i32>, <4 x i32> addrspace(2)* %25, align 16, !tbaa !0
>   %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
>   %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
>   %29 = bitcast float %27 to i32
>   %30 = bitcast float %28 to i32
>   %31 = insertelement <2 x i32> undef, i32 %29, i32 0
>   %32 = insertelement <2 x i32> %31, i32 %30, i32 1
>   %33 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %32, <8 x i32> %24, <4 x i32> %26, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
>   %34 = extractelement <4 x float> %33, i32 0
>   %35 = extractelement <4 x float> %33, i32 1
>   %36 = extractelement <4 x float> %33, i32 2
>   %37 = extractelement <4 x float> %33, i32 3
>   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %34, float %35, float %36, float %37)
>   ret void
> }
> 
> ; Function Attrs: nounwind readnone
> declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
> 
> ; Function Attrs: nounwind readnone
> declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
> 
> declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
> 
> attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
> attributes #1 = { nounwind readnone }
> 
> !0 = !{!"const", null, i32 1}
> 
> Shader Disassembly:
> 
> 	s_wqm_b64 exec, exec                                                   ; BEFE0A7E
> 	s_load_dwordx8 s[12:19], s[6:7], 0x0                                   ; C0C60700
> 	s_load_dwordx4 s[0:3], s[4:5], 0x0                                     ; C0800500
> 	s_mov_b32 m0, s10                                                      ; BEFC030A
> 	v_interp_p1_f32 v2, v0, 0, 0, [m0]                                     ; C8080000
> 	v_interp_p2_f32 v2, [v2], v1, 0, 0, [m0]                               ; C8090001
> 	v_interp_p1_f32 v3, v0, 1, 0, [m0]                                     ; C80C0100
> 	v_interp_p2_f32 v3, [v3], v1, 1, 0, [m0]                               ; C80D0101
> 	s_waitcnt lgkmcnt(0)                                                   ; BF8C007F
> 	image_sample v[0:3], 15, 0, 0, 0, 0, 0, 0, 0, v[2:3], s[12:19], s[0:3] ; F0800F00 00030002
> 	s_waitcnt vmcnt(0)                                                     ; BF8C0770
> 	exp 15, 0, 0, 1, 1, v0, v1, v2, v3                                     ; F800180F 03020100
> 	s_endpgm                                                               ; BF810000
> 
> *** SHADER STATS ***
> SGPRS: 24
> VGPRS: 4
> Code Size: 60 bytes
> LDS: 0 blocks
> Scratch: 0 bytes per wave
> ********************