AMDGPU/SI: Fix bitcast between v2f32 and f64

Thu Dec 10 23:37:47 PST 2015

Attached are the output of a failing run of the piglit graphics test
fs-abs-dvec2.shader_test with the radeonsi driver, and a patch fixing
it (and similar failures in many other fp64 tests). I haven't been able
to produce a simple test case for this.

-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AMDGPU-SI-Fix-bitcast-between-v2f32-and-f64.patch
Type: text/x-patch
Size: 1535 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20151211/d1713efd/attachment.bin>
-------------- next part --------------
SHADER KEY
  instance_divisors = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  as_es = 0
  as_ls = 0
  export_prim_id = 0
VERT
DCL IN[0]
DCL OUT[0], POSITION
  0: MOV OUT[0], IN[0]
  1: END
; ModuleID = 'tgsi'

; Vertex shader entry point. It fetches one <4 x float> vertex input and
; exports its four components unchanged (the TGSI listing above is a single
; MOV OUT[0], IN[0]).
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
main_body:
  ; Load the <16 x i8> value at index 0 of the array passed as argument %4.
  %12 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
  %13 = load <16 x i8>, <16 x i8> addrspace(2)* %12, align 16, !tbaa !0
  ; The index handed to the input fetch is the sum of arguments %5 and %8.
  %14 = add i32 %5, %8
  %15 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %13, i32 0, i32 %14)
  ; Split the fetched vector into its four scalar components.
  %16 = extractelement <4 x float> %15, i32 0
  %17 = extractelement <4 x float> %15, i32 1
  %18 = extractelement <4 x float> %15, i32 2
  %19 = extractelement <4 x float> %15, i32 3
  ; Export the four components as-is.
  ; NOTE(review): the i32 12 operand presumably selects the position output
  ; (OUT[0] is declared POSITION in the TGSI) - confirm against the
  ; llvm.SI.export intrinsic definition.
  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %16, float %17, float %18, float %19)
  ret void
}

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1

declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }

!0 = !{!"const", null, i32 1}

Shader Disassembly:

	s_load_dwordx4 s[0:3], s[8:9], 0x0                  ; C0800900
	v_add_i32_e32 v0, vcc, s10, v0                      ; 4A00000A
	s_waitcnt lgkmcnt(0)                                ; BF8C007F
	buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen ; E00C2000 80000000
	s_waitcnt vmcnt(0)                                  ; BF8C0770
	exp 15, 12, 0, 1, 0, v0, v1, v2, v3                 ; F80008CF 03020100
	s_endpgm                                            ; BF810000

*** SHADER STATS ***
SGPRS: 16
VGPRS: 4
Code Size: 36 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
SHADER KEY
  export_16bpc = 0x3
  last_cbuf = 0
  color_two_side = 0
  alpha_func = 7
  alpha_to_one = 0
  poly_stipple = 0
  clamp_color = 0
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL OUT[0], COLOR
DCL CONST[0..2]
DCL TEMP[0..2], LOCAL
IMM[0] FLT32 {    0.0000,     1.0000,     0.0000,     0.0000}
  0: DABS TEMP[0].xy, CONST[0].xyxy
  1: DABS TEMP[0].zw, CONST[0].zwzw
  2: DNEG TEMP[1].xy, CONST[2].xyxy
  3: DNEG TEMP[1].zw, CONST[2].zwzw
  4: DADD TEMP[2].xy, TEMP[0].xyxy, TEMP[1].xyxy
  5: DADD TEMP[2].zw, TEMP[0].zwzw, TEMP[1].zwzw
  6: DMUL TEMP[0].xy, TEMP[2].zwzw, TEMP[2].zwzw
  7: DFMA TEMP[0].xy, TEMP[2].xyxy, TEMP[2].xyxy, TEMP[0].xyxy
  8: DSQRT TEMP[0].xy, TEMP[0].xyxy
  9: DSGE TEMP[0].x, CONST[1].xyxy, TEMP[0].xyxy
 10: UIF TEMP[0].xxxx :0
 11:   MOV TEMP[0], IMM[0].xyxy
 12: ELSE :0
 13:   MOV TEMP[0], IMM[0].yxxy
 14: ENDIF
 15: MOV OUT[0], TEMP[0]
 16: END
; ModuleID = 'tgsi'

; Fragment shader entry point for the fp64 test case. Using the values named
; below, it computes sqrt(%63 * %63 + %70 * %70) in double precision,
; compares a third double constant against that result with "oge", and
; exports a packed colour chosen by the comparison.
; The groups below appear to map one-to-one onto the TGSI instructions
; 0-15 listed above.
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
  ; Load the <16 x i8> value at index 0 of the array passed as argument %1;
  ; it is the first operand of every llvm.SI.load.const call below.
  %23 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0
  %24 = load <16 x i8>, <16 x i8> addrspace(2)* %23, align 16, !tbaa !0
  ; Ten 32-bit constant loads at offsets 0-20 and 32-44. The disassembly
  ; below shows them as s_buffer_load_dword with offsets 0x0-0x5 and 0x8-0xb.
  %25 = call float @llvm.SI.load.const(<16 x i8> %24, i32 0)
  %26 = call float @llvm.SI.load.const(<16 x i8> %24, i32 4)
  %27 = call float @llvm.SI.load.const(<16 x i8> %24, i32 8)
  %28 = call float @llvm.SI.load.const(<16 x i8> %24, i32 12)
  %29 = call float @llvm.SI.load.const(<16 x i8> %24, i32 16)
  %30 = call float @llvm.SI.load.const(<16 x i8> %24, i32 20)
  %31 = call float @llvm.SI.load.const(<16 x i8> %24, i32 32)
  %32 = call float @llvm.SI.load.const(<16 x i8> %24, i32 36)
  %33 = call float @llvm.SI.load.const(<16 x i8> %24, i32 40)
  %34 = call float @llvm.SI.load.const(<16 x i8> %24, i32 44)
  ; Assemble a double from the 32-bit halves %25/%26 and take its absolute
  ; value (TGSI 0: DABS).
  %35 = bitcast float %25 to i32
  %36 = insertelement <2 x i32> undef, i32 %35, i32 0
  %37 = bitcast float %26 to i32
  %38 = insertelement <2 x i32> %36, i32 %37, i32 1
  %39 = bitcast <2 x i32> %38 to double
  %40 = call double @llvm.fabs.f64(double %39)
  ; Same for the halves %27/%28 (TGSI 1: DABS).
  %41 = bitcast float %27 to i32
  %42 = insertelement <2 x i32> undef, i32 %41, i32 0
  %43 = bitcast float %28 to i32
  %44 = insertelement <2 x i32> %42, i32 %43, i32 1
  %45 = bitcast <2 x i32> %44 to double
  %46 = call double @llvm.fabs.f64(double %45)
  ; Assemble a double from %31/%32 and negate it, written as -0.0 - x
  ; (TGSI 2: DNEG).
  %47 = bitcast float %31 to i32
  %48 = insertelement <2 x i32> undef, i32 %47, i32 0
  %49 = bitcast float %32 to i32
  %50 = insertelement <2 x i32> %48, i32 %49, i32 1
  %51 = bitcast <2 x i32> %50 to double
  %52 = fsub double -0.000000e+00, %51
  ; Same for the halves %33/%34 (TGSI 3: DNEG).
  %53 = bitcast float %33 to i32
  %54 = insertelement <2 x i32> undef, i32 %53, i32 0
  %55 = bitcast float %34 to i32
  %56 = insertelement <2 x i32> %54, i32 %55, i32 1
  %57 = bitcast <2 x i32> %56 to double
  %58 = fsub double -0.000000e+00, %57
  ; First sum: %63 = %40 + %52 (TGSI 4: DADD).
  ; Each shufflevector with mask <0, 3> takes element 0 of its first operand
  ; and element 1 of its second. Both operands are bitcasts of the same
  ; double, so the result carries the same bits as that double.
  %bc20 = bitcast double %40 to <2 x i32>
  %bc = bitcast double %40 to <2 x i32>
  %59 = shufflevector <2 x i32> %bc20, <2 x i32> %bc, <2 x i32> <i32 0, i32 3>
  %60 = bitcast <2 x i32> %59 to double
  %bc21 = bitcast double %52 to <2 x i32>
  %bc22 = bitcast double %52 to <2 x i32>
  %61 = shufflevector <2 x i32> %bc21, <2 x i32> %bc22, <2 x i32> <i32 0, i32 3>
  %62 = bitcast <2 x i32> %61 to double
  %63 = fadd double %60, %62
  ; Split the sum into two 32-bit float halves through a
  ; double -> <2 x float> bitcast.
  ; NOTE(review): this looks like the "bitcast between v2f32 and f64" named
  ; in the subject line - confirm against the attached patch.
  %bc23 = bitcast double %63 to <2 x float>
  %64 = extractelement <2 x float> %bc23, i32 0
  %bc24 = bitcast double %63 to <2 x float>
  %65 = extractelement <2 x float> %bc24, i32 1
  ; Second sum: %70 = %46 + %58 (TGSI 5: DADD), split the same way into
  ; %71/%72.
  %bc25 = bitcast double %46 to <2 x i32>
  %bc26 = bitcast double %46 to <2 x i32>
  %66 = shufflevector <2 x i32> %bc25, <2 x i32> %bc26, <2 x i32> <i32 0, i32 3>
  %67 = bitcast <2 x i32> %66 to double
  %bc27 = bitcast double %58 to <2 x i32>
  %bc28 = bitcast double %58 to <2 x i32>
  %68 = shufflevector <2 x i32> %bc27, <2 x i32> %bc28, <2 x i32> <i32 0, i32 3>
  %69 = bitcast <2 x i32> %68 to double
  %70 = fadd double %67, %69
  %bc29 = bitcast double %70 to <2 x float>
  %71 = extractelement <2 x float> %bc29, i32 0
  %bc30 = bitcast double %70 to <2 x float>
  %72 = extractelement <2 x float> %bc30, i32 1
  ; Rebuild the second sum from its halves %71/%72 twice and multiply the two
  ; copies, i.e. square it (TGSI 6: DMUL).
  %73 = bitcast float %71 to i32
  %74 = insertelement <2 x i32> undef, i32 %73, i32 0
  %75 = bitcast float %72 to i32
  %76 = insertelement <2 x i32> %74, i32 %75, i32 1
  %77 = bitcast <2 x i32> %76 to double
  %78 = bitcast float %71 to i32
  %79 = insertelement <2 x i32> undef, i32 %78, i32 0
  %80 = bitcast float %72 to i32
  %81 = insertelement <2 x i32> %79, i32 %80, i32 1
  %82 = bitcast <2 x i32> %81 to double
  %83 = fmul double %77, %82
  ; Rebuild the first sum from its halves %64/%65 twice; these are the two
  ; multiplicands of the fma below.
  %84 = bitcast float %64 to i32
  %85 = insertelement <2 x i32> undef, i32 %84, i32 0
  %86 = bitcast float %65 to i32
  %87 = insertelement <2 x i32> %85, i32 %86, i32 1
  %88 = bitcast <2 x i32> %87 to double
  %89 = bitcast float %64 to i32
  %90 = insertelement <2 x i32> undef, i32 %89, i32 0
  %91 = bitcast float %65 to i32
  %92 = insertelement <2 x i32> %90, i32 %91, i32 1
  %93 = bitcast <2 x i32> %92 to double
  ; %96 = %88 * %93 + %95, where %95 carries the bits of the product %83
  ; (TGSI 7: DFMA).
  %bc33 = bitcast double %83 to <2 x i32>
  %bc34 = bitcast double %83 to <2 x i32>
  %94 = shufflevector <2 x i32> %bc33, <2 x i32> %bc34, <2 x i32> <i32 0, i32 3>
  %95 = bitcast <2 x i32> %94 to double
  %96 = call double @llvm.fma.f64(double %88, double %93, double %95)
  ; Square root of the fma result (TGSI 8: DSQRT).
  %bc37 = bitcast double %96 to <2 x i32>
  %bc38 = bitcast double %96 to <2 x i32>
  %97 = shufflevector <2 x i32> %bc37, <2 x i32> %bc38, <2 x i32> <i32 0, i32 3>
  %98 = bitcast <2 x i32> %97 to double
  %99 = call double @llvm.sqrt.f64(double %98)
  ; Assemble the comparison operand from the constant halves %29/%30.
  %100 = bitcast float %29 to i32
  %101 = insertelement <2 x i32> undef, i32 %100, i32 0
  %102 = bitcast float %30 to i32
  %103 = insertelement <2 x i32> %101, i32 %102, i32 1
  %104 = bitcast <2 x i32> %103 to double
  ; Ordered "constant >= sqrt result" comparison (TGSI 9: DSGE).
  %bc41 = bitcast double %99 to <2 x i32>
  %bc42 = bitcast double %99 to <2 x i32>
  %105 = shufflevector <2 x i32> %bc41, <2 x i32> %bc42, <2 x i32> <i32 0, i32 3>
  %106 = bitcast <2 x i32> %105 to double
  %107 = fcmp oge double %104, %106
  ; The TGSI UIF/ELSE (instructions 10-14) appears here as two selects:
  ; %. is 1.0 when the comparison holds and 0.0 otherwise; %.12 is the
  ; opposite.
  %. = select i1 %107, float 1.000000e+00, float 0.000000e+00
  %.12 = select i1 %107, float 0.000000e+00, float 1.000000e+00
  ; Pack (%.12, %.) and the fixed pair (0.0, 1.0) with llvm.SI.packf16 and
  ; export the two packed values, each passed twice.
  %108 = call i32 @llvm.SI.packf16(float %.12, float %.)
  %109 = bitcast i32 %108 to float
  %110 = call i32 @llvm.SI.packf16(float 0.000000e+00, float 1.000000e+00)
  %111 = bitcast i32 %110 to float
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %109, float %111, float %109, float %111)
  ret void
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #1

; Function Attrs: nounwind readnone
declare double @llvm.fabs.f64(double) #1

; Function Attrs: nounwind readnone
declare double @llvm.fma.f64(double, double, double) #1

; Function Attrs: nounwind readnone
declare double @llvm.sqrt.f64(double) #1

; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1

declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }

!0 = !{!"const", null, i32 1}

Shader Disassembly:

	s_load_dwordx4 s[0:3], s[2:3], 0x0       ; C0800300
	s_waitcnt lgkmcnt(0)                     ; BF8C007F
	s_buffer_load_dword s4, s[0:3], 0x0      ; C2020100
	s_buffer_load_dword s6, s[0:3], 0x2      ; C2030102
	s_buffer_load_dword s8, s[0:3], 0x4      ; C2040104
	s_buffer_load_dword s10, s[0:3], 0x8     ; C2050108
	s_buffer_load_dword s11, s[0:3], 0x9     ; C2058109
	s_buffer_load_dword s12, s[0:3], 0xa     ; C206010A
	s_buffer_load_dword s13, s[0:3], 0xb     ; C206810B
	s_buffer_load_dword s7, s[0:3], 0x3      ; C2038103
	s_buffer_load_dword s5, s[0:3], 0x1      ; C2028101
	s_buffer_load_dword s9, s[0:3], 0x5      ; C2048105
	s_waitcnt lgkmcnt(0)                     ; BF8C007F
	v_mov_b32_e32 v0, s10                    ; 7E00020A
	v_mov_b32_e32 v1, s11                    ; 7E02020B
	v_mov_b32_e32 v2, s12                    ; 7E04020C
	v_mov_b32_e32 v3, s13                    ; 7E06020D
	v_add_f64 v[2:3], |s[6:7]|, -v[2:3]      ; D2C80102 40020406
	v_add_f64 v[0:1], |s[4:5]|, -v[0:1]      ; D2C80100 40020004
	v_mul_f64 v[2:3], v[2:3], v[2:3]         ; D2CA0002 00020502
	v_fma_f64 v[0:1], v[0:1], v[0:1], v[2:3] ; D2980000 040A0100
	v_sqrt_f64_e32 v[0:1], v[0:1]            ; 7E006900
	v_cmp_ge_f64_e32 vcc, s[8:9], v[0:1]     ; 7C4C0008
	v_cndmask_b32_e64 v0, 0, 1.0, vcc        ; D2000000 01A9E480
	v_cndmask_b32_e64 v1, 1.0, 0, vcc        ; D2000001 01A900F2
	v_cvt_pkrtz_f16_f32_e32 v0, v1, v0       ; 5E000101
	v_cvt_pkrtz_f16_f32_e64 v1, 0, 1.0       ; D25E0001 0001E480
	exp 15, 0, 1, 1, 1, v0, v1, v0, v1       ; F8001C0F 01000100
	s_endpgm                                 ; BF810000

*** SHADER STATS ***
SGPRS: 16
VGPRS: 4
Code Size: 148 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
SHADER KEY
  instance_divisors = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  as_es = 0
  as_ls = 0
  export_prim_id = 0
VERT
DCL IN[0]
DCL IN[1]
DCL OUT[0], POSITION
DCL OUT[1], GENERIC[0]
  0: MOV OUT[0], IN[0]
  1: MOV OUT[1], IN[1]
  2: END
; ModuleID = 'tgsi'

; Vertex shader entry point. It fetches two <4 x float> vertex inputs and
; exports both unchanged (the TGSI listing above is MOV OUT[0], IN[0] and
; MOV OUT[1], IN[1]).
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
main_body:
  ; First input: load the <16 x i8> value at index 0 of argument %4 and fetch
  ; with index %5 + %8.
  %12 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
  %13 = load <16 x i8>, <16 x i8> addrspace(2)* %12, align 16, !tbaa !0
  %14 = add i32 %5, %8
  %15 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %13, i32 0, i32 %14)
  %16 = extractelement <4 x float> %15, i32 0
  %17 = extractelement <4 x float> %15, i32 1
  %18 = extractelement <4 x float> %15, i32 2
  %19 = extractelement <4 x float> %15, i32 3
  ; Second input: same sequence using index 1 of argument %4. The fetch index
  ; %5 + %8 is recomputed as %22 rather than reusing %14.
  %20 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1
  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, align 16, !tbaa !0
  %22 = add i32 %5, %8
  %23 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %21, i32 0, i32 %22)
  %24 = extractelement <4 x float> %23, i32 0
  %25 = extractelement <4 x float> %23, i32 1
  %26 = extractelement <4 x float> %23, i32 2
  %27 = extractelement <4 x float> %23, i32 3
  ; The second input is exported first, then the first input.
  ; NOTE(review): the i32 32 and i32 12 operands presumably select the
  ; GENERIC[0] and POSITION outputs declared in the TGSI - confirm against
  ; the llvm.SI.export intrinsic definition.
  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %24, float %25, float %26, float %27)
  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %16, float %17, float %18, float %19)
  ret void
}

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1

declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }

!0 = !{!"const", null, i32 1}

Shader Disassembly:

	s_load_dwordx4 s[0:3], s[8:9], 0x0                  ; C0800900
	s_load_dwordx4 s[4:7], s[8:9], 0x4                  ; C0820904
	v_add_i32_e32 v0, vcc, s10, v0                      ; 4A00000A
	s_waitcnt lgkmcnt(0)                                ; BF8C007F
	buffer_load_format_xyzw v[1:4], v0, s[0:3], 0 idxen ; E00C2000 80000100
	buffer_load_format_xyzw v[5:8], v0, s[4:7], 0 idxen ; E00C2000 80010500
	s_waitcnt vmcnt(0)                                  ; BF8C0770
	exp 15, 32, 0, 0, 0, v5, v6, v7, v8                 ; F800020F 08070605
	exp 15, 12, 0, 1, 0, v1, v2, v3, v4                 ; F80008CF 04030201
	s_endpgm                                            ; BF810000

*** SHADER STATS ***
SGPRS: 16
VGPRS: 12
Code Size: 56 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
SHADER KEY
  export_16bpc = 0x3
  last_cbuf = 0
  color_two_side = 0
  alpha_func = 7
  alpha_to_one = 0
  poly_stipple = 0
  clamp_color = 0
FRAG
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
; ModuleID = 'tgsi'

; Fragment shader entry point. It reads the four channels of one input with
; llvm.SI.fs.constant, packs them pairwise with llvm.SI.packf16 and exports
; the result (the TGSI listing above is MOV OUT[0], IN[0] with IN[0]
; declared CONSTANT).
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
  ; Four reads that differ only in the first operand (0-3); all of them pass
  ; argument %6 as the last operand.
  %23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
  %24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
  %25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
  %26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
  ; Pack (%23, %24) and (%25, %26) into one i32 each, then reinterpret the
  ; i32 bits as float so they fit the float operands of the export.
  %27 = call i32 @llvm.SI.packf16(float %23, float %24)
  %28 = bitcast i32 %27 to float
  %29 = call i32 @llvm.SI.packf16(float %25, float %26)
  %30 = bitcast i32 %29 to float
  ; Each packed value is passed twice (%28, %30, %28, %30).
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %28, float %30, float %28, float %30)
  ret void
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1

; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1

declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }

Shader Disassembly:

	s_mov_b32 m0, s10                   ; BEFC030A
	v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
	v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1  ; 5E000300
	v_interp_mov_f32 v1, P0, 2, 0, [m0] ; C8060202
	v_interp_mov_f32 v2, P0, 3, 0, [m0] ; C80A0302
	v_cvt_pkrtz_f16_f32_e32 v1, v1, v2  ; 5E020501
	exp 15, 0, 1, 1, 1, v0, v1, v0, v1  ; F8001C0F 01000100
	s_endpgm                            ; BF810000

*** SHADER STATS ***
SGPRS: 16
VGPRS: 4
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************
SHADER KEY
  export_16bpc = 0x0
  last_cbuf = 0
  color_two_side = 0
  alpha_func = 7
  alpha_to_one = 0
  poly_stipple = 0
  clamp_color = 0
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
  0: TEX OUT[0], IN[0], SAMP[0], 2D
  1: END
; ModuleID = 'tgsi'

; Fragment shader entry point. It interpolates two coordinates, samples an
; image with them and exports the four sampled components (the TGSI listing
; above is a single TEX OUT[0], IN[0], SAMP[0], 2D).
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
main_body:
  ; Load the <8 x i32> value at index 0 of argument %3 and the <4 x i32>
  ; value at index 0 of argument %2; both are operands of the sample call.
  %23 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %3, i64 0, i64 0
  %24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !tbaa !0
  %25 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %2, i64 0, i64 0
  %26 = load <4 x i32>, <4 x i32> addrspace(2)* %25, align 16, !tbaa !0
  ; Two interpolated values that differ only in the first operand (0 and 1);
  ; both use arguments %6 and %12.
  %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
  %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
  ; Reinterpret both floats as i32 and place them in a <2 x i32> address
  ; vector for the sample call.
  %29 = bitcast float %27 to i32
  %30 = bitcast float %28 to i32
  %31 = insertelement <2 x i32> undef, i32 %29, i32 0
  %32 = insertelement <2 x i32> %31, i32 %30, i32 1
  %33 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %32, <8 x i32> %24, <4 x i32> %26, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ; Split the sampled vector into four scalars and export them unchanged.
  %34 = extractelement <4 x float> %33, i32 0
  %35 = extractelement <4 x float> %33, i32 1
  %36 = extractelement <4 x float> %33, i32 2
  %37 = extractelement <4 x float> %33, i32 3
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %34, float %35, float %36, float %37)
  ret void
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1

declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
attributes #1 = { nounwind readnone }

!0 = !{!"const", null, i32 1}

Shader Disassembly:

	s_wqm_b64 exec, exec                                                   ; BEFE0A7E
	s_load_dwordx8 s[12:19], s[6:7], 0x0                                   ; C0C60700
	s_load_dwordx4 s[0:3], s[4:5], 0x0                                     ; C0800500
	s_mov_b32 m0, s10                                                      ; BEFC030A
	v_interp_p1_f32 v2, v0, 0, 0, [m0]                                     ; C8080000
	v_interp_p2_f32 v2, [v2], v1, 0, 0, [m0]                               ; C8090001
	v_interp_p1_f32 v3, v0, 1, 0, [m0]                                     ; C80C0100
	v_interp_p2_f32 v3, [v3], v1, 1, 0, [m0]                               ; C80D0101
	s_waitcnt lgkmcnt(0)                                                   ; BF8C007F
	image_sample v[0:3], 15, 0, 0, 0, 0, 0, 0, 0, v[2:3], s[12:19], s[0:3] ; F0800F00 00030002
	s_waitcnt vmcnt(0)                                                     ; BF8C0770
	exp 15, 0, 0, 1, 1, v0, v1, v2, v3                                     ; F800180F 03020100
	s_endpgm                                                               ; BF810000

*** SHADER STATS ***
SGPRS: 24
VGPRS: 4
Code Size: 60 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
********************