[llvm] r321556 - AMDGPU: Remove mayLoad/hasSideEffects from MIMG stores

Michel Dänzer via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 3 08:18:16 PST 2018


Hi Matt,


On 2017-12-29 06:18 PM, Matt Arsenault via llvm-commits wrote:
> Author: arsenm
> Date: Fri Dec 29 09:18:18 2017
> New Revision: 321556
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=321556&view=rev
> Log:
> AMDGPU: Remove mayLoad/hasSideEffects from MIMG stores
> 
> Atomics still have hasSideEffects set on them because
> of the mess that is the memory properties.

This change broke the piglit test
"spec@arb_shader_image_load_store@restrict@no qualifier image aliasing
test" for me on Tonga.

I'm attaching the pre-optimization LLVM IR and the good and bad disassembly.


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
-------------- next part --------------
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Main part of the pixel shader, before optimization.
;
; Runs a loop with a counter i (kept in %TEMP0.x) from 0 while i < 768. Each
; iteration does, in this order:
;   1. image load  from descriptor %3[15] at address 16
;   2. image store to   descriptor %3[14] at address 2*i      (loaded value + 1)
;   3. image load  from descriptor %3[15] at address 16       (again)
;   4. image store to   descriptor %3[14] at address 2*i + 1  (loaded value - 1)
; After the loop it fetches attribute values with llvm.amdgcn.interp.mov and
; returns them in elements 9..12 of the result struct; element 8 carries
; argument %4 and element 23 carries argument %20. All other elements are undef.
;
; Integer values are kept in float-typed stack slots and moved in and out with
; bitcasts, so every "float" temporary below may really hold an i32 bit pattern.
; Function Attrs: alwaysinline
define private amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([12 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  ; One stack slot per channel (x/y/z/w) for TEMP0..TEMP11 and OUT0. Many of
  ; these slots are never read or written in the body below.
  %TEMP11.w = alloca float
  %TEMP11.z = alloca float
  %TEMP11.y = alloca float
  %TEMP11.x = alloca float
  %TEMP10.w = alloca float
  %TEMP10.z = alloca float
  %TEMP10.y = alloca float
  %TEMP10.x = alloca float
  %TEMP9.w = alloca float
  %TEMP9.z = alloca float
  %TEMP9.y = alloca float
  %TEMP9.x = alloca float
  %TEMP8.w = alloca float
  %TEMP8.z = alloca float
  %TEMP8.y = alloca float
  %TEMP8.x = alloca float
  %TEMP7.w = alloca float
  %TEMP7.z = alloca float
  %TEMP7.y = alloca float
  %TEMP7.x = alloca float
  %TEMP6.w = alloca float
  %TEMP6.z = alloca float
  %TEMP6.y = alloca float
  %TEMP6.x = alloca float
  %TEMP5.w = alloca float
  %TEMP5.z = alloca float
  %TEMP5.y = alloca float
  %TEMP5.x = alloca float
  %TEMP4.w = alloca float
  %TEMP4.z = alloca float
  %TEMP4.y = alloca float
  %TEMP4.x = alloca float
  %TEMP3.w = alloca float
  %TEMP3.z = alloca float
  %TEMP3.y = alloca float
  %TEMP3.x = alloca float
  %TEMP2.w = alloca float
  %TEMP2.z = alloca float
  %TEMP2.y = alloca float
  %TEMP2.x = alloca float
  %TEMP1.w = alloca float
  %TEMP1.z = alloca float
  %TEMP1.y = alloca float
  %TEMP1.x = alloca float
  %TEMP0.w = alloca float
  %TEMP0.z = alloca float
  %TEMP0.y = alloca float
  %TEMP0.x = alloca float
  %OUT0.w = alloca float
  %OUT0.z = alloca float
  %OUT0.y = alloca float
  %OUT0.x = alloca float
  ; Loop counter i = 0 (all-zero bit pattern).
  store float 0.000000e+00, float* %TEMP0.x
  br label %loop1

loop1:                                            ; preds = %endif5, %main_body
  ; Loop header: leave the loop once i >= 768 (signed compare on the bits of
  ; TEMP0.x). The comparison result takes a round trip through TEMP1.x.
  %22 = load float, float* %TEMP0.x
  %23 = bitcast float %22 to i32
  %24 = icmp sge i32 %23, 768
  %25 = sext i1 %24 to i32
  %26 = bitcast i32 %25 to float
  store float %26, float* %TEMP1.x
  %27 = load float, float* %TEMP1.x
  %28 = bitcast float %27 to i32
  %29 = icmp ne i32 %28, 0
  br i1 %29, label %if3, label %endif5

if3:                                              ; preds = %loop1
  br label %endloop19

endif5:                                           ; preds = %loop1
  ; 0x36E0000000000000 is 2^-145, i.e. the float whose bit pattern is the
  ; integer 16. TEMP2.x is not read again in this function.
  store float 0x36E0000000000000, float* %TEMP2.x
  ; First image load: descriptor %3[15], address 16, dmask 15 (all four
  ; channels). The call site is tagged readonly (#1).
  %30 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 15, !amdgpu.uniform !0
  %31 = load <8 x i32>, <8 x i32> addrspace(2)* %30, !invariant.load !0
  %32 = call nsz <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 16, <8 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false) #1
  %33 = extractelement <4 x float> %32, i32 0
  %34 = extractelement <4 x float> %32, i32 1
  %35 = extractelement <4 x float> %32, i32 2
  %36 = extractelement <4 x float> %32, i32 3
  store float %33, float* %TEMP3.x
  store float %34, float* %TEMP3.y
  store float %35, float* %TEMP3.z
  store float %36, float* %TEMP3.w
  ; First store address: TEMP4.x = 2 * i.
  %37 = load float, float* %TEMP0.x
  %38 = bitcast float %37 to i32
  %39 = mul i32 2, %38
  %40 = bitcast i32 %39 to float
  store float %40, float* %TEMP4.x
  %41 = load float, float* %TEMP4.x
  store float %41, float* %TEMP5.x
  ; TEMP6 = TEMP3 + 1 per channel, as integer adds on the loaded bits.
  %42 = load float, float* %TEMP3.x
  %43 = bitcast float %42 to i32
  %44 = add i32 %43, 1
  %45 = load float, float* %TEMP3.y
  %46 = bitcast float %45 to i32
  %47 = add i32 %46, 1
  %48 = load float, float* %TEMP3.z
  %49 = bitcast float %48 to i32
  %50 = add i32 %49, 1
  %51 = load float, float* %TEMP3.w
  %52 = bitcast float %51 to i32
  %53 = add i32 %52, 1
  %54 = bitcast i32 %44 to float
  store float %54, float* %TEMP6.x
  %55 = bitcast i32 %47 to float
  store float %55, float* %TEMP6.y
  %56 = bitcast i32 %50 to float
  store float %56, float* %TEMP6.z
  %57 = bitcast i32 %53 to float
  store float %57, float* %TEMP6.w
  %58 = load float, float* %TEMP6.x
  %59 = load float, float* %TEMP6.y
  %60 = load float, float* %TEMP6.z
  %61 = load float, float* %TEMP6.w
  %62 = insertelement <4 x float> undef, float %58, i32 0
  %63 = insertelement <4 x float> %62, float %59, i32 1
  %64 = insertelement <4 x float> %63, float %60, i32 2
  %65 = insertelement <4 x float> %64, float %61, i32 3
  ; Store descriptor: %3[14] with bit 21 of dword 6 cleared
  ; (-2097153 == 0xFFDFFFFF).
  ; NOTE(review): the meaning of that descriptor bit is not visible here;
  ; presumably it disables a compression feature for writable images - confirm
  ; against the driver.
  %66 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 14, !amdgpu.uniform !0
  %67 = load <8 x i32>, <8 x i32> addrspace(2)* %66, !invariant.load !0
  %68 = extractelement <8 x i32> %67, i32 6
  %69 = and i32 %68, -2097153
  %70 = insertelement <8 x i32> %67, i32 %69, i32 6
  %71 = load float, float* %TEMP4.x
  %72 = bitcast float %71 to i32
  ; First image store: TEMP6 to address 2*i. The call site is tagged
  ; writeonly (#2).
  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %65, i32 %72, <8 x i32> %70, i32 15, i1 false, i1 false, i1 false, i1 false) #2
  store float 0x36E0000000000000, float* %TEMP7.x
  ; Second image load: same descriptor (%3[15]) and address (16) as the first.
  ; In program order it comes AFTER the first image store above.
  %73 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 15, !amdgpu.uniform !0
  %74 = load <8 x i32>, <8 x i32> addrspace(2)* %73, !invariant.load !0
  %75 = call nsz <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 16, <8 x i32> %74, i32 15, i1 false, i1 false, i1 false, i1 false) #1
  %76 = extractelement <4 x float> %75, i32 0
  %77 = extractelement <4 x float> %75, i32 1
  %78 = extractelement <4 x float> %75, i32 2
  %79 = extractelement <4 x float> %75, i32 3
  store float %76, float* %TEMP8.x
  store float %77, float* %TEMP8.y
  store float %78, float* %TEMP8.z
  store float %79, float* %TEMP8.w
  ; Second store address: TEMP9.x = 2 * i + 1.
  %80 = load float, float* %TEMP0.x
  %81 = bitcast float %80 to i32
  %82 = mul i32 2, %81
  %83 = add i32 %82, 1
  %84 = bitcast i32 %83 to float
  store float %84, float* %TEMP9.x
  %85 = load float, float* %TEMP9.x
  store float %85, float* %TEMP10.x
  ; TEMP11 = TEMP8 - 1 per channel (integer add of -1).
  %86 = load float, float* %TEMP8.x
  %87 = bitcast float %86 to i32
  %88 = add i32 %87, -1
  %89 = load float, float* %TEMP8.y
  %90 = bitcast float %89 to i32
  %91 = add i32 %90, -1
  %92 = load float, float* %TEMP8.z
  %93 = bitcast float %92 to i32
  %94 = add i32 %93, -1
  %95 = load float, float* %TEMP8.w
  %96 = bitcast float %95 to i32
  %97 = add i32 %96, -1
  %98 = bitcast i32 %88 to float
  store float %98, float* %TEMP11.x
  %99 = bitcast i32 %91 to float
  store float %99, float* %TEMP11.y
  %100 = bitcast i32 %94 to float
  store float %100, float* %TEMP11.z
  %101 = bitcast i32 %97 to float
  store float %101, float* %TEMP11.w
  %102 = load float, float* %TEMP11.x
  %103 = load float, float* %TEMP11.y
  %104 = load float, float* %TEMP11.z
  %105 = load float, float* %TEMP11.w
  %106 = insertelement <4 x float> undef, float %102, i32 0
  %107 = insertelement <4 x float> %106, float %103, i32 1
  %108 = insertelement <4 x float> %107, float %104, i32 2
  %109 = insertelement <4 x float> %108, float %105, i32 3
  ; Same store descriptor setup as for the first store (%3[14], bit 21 of
  ; dword 6 cleared).
  %110 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 14, !amdgpu.uniform !0
  %111 = load <8 x i32>, <8 x i32> addrspace(2)* %110, !invariant.load !0
  %112 = extractelement <8 x i32> %111, i32 6
  %113 = and i32 %112, -2097153
  %114 = insertelement <8 x i32> %111, i32 %113, i32 6
  %115 = load float, float* %TEMP9.x
  %116 = bitcast float %115 to i32
  ; Second image store: TEMP11 to address 2*i + 1.
  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %109, i32 %116, <8 x i32> %114, i32 15, i1 false, i1 false, i1 false, i1 false) #2
  ; i = i + 1, then back to the loop header.
  %117 = load float, float* %TEMP0.x
  %118 = bitcast float %117 to i32
  %119 = add i32 %118, 1
  %120 = bitcast i32 %119 to float
  store float %120, float* %TEMP0.x
  br label %loop1

endloop19:                                        ; preds = %if3
  ; Sixteen interp.mov calls (four identical groups over channels 0..3), all
  ; with first operand 2, attribute operand 0 and %5 as last operand. Only
  ; %121, %126, %131 and %136 are used below.
  %121 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %122 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %123 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %124 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %125 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %126 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %127 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %128 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %129 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %130 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %131 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %132 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %133 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %134 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %135 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %136 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  store float %121, float* %OUT0.x
  store float %126, float* %OUT0.y
  store float %131, float* %OUT0.z
  store float %136, float* %OUT0.w
  %137 = load float, float* %OUT0.x
  %138 = load float, float* %OUT0.y
  %139 = load float, float* %OUT0.z
  %140 = load float, float* %OUT0.w
  ; Build the return struct: [8] = bits of %4, [9..12] = OUT0.xyzw, [23] = %20.
  %141 = bitcast float %4 to i32
  %142 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %141, 8
  %143 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %142, float %137, 9
  %144 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %143, float %138, 10
  %145 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %144, float %139, 11
  %146 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %145, float %140, 12
  %147 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %146, float %20, 23
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %147
}

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #2

; Function Attrs: nounwind readnone speculatable
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #3

; Pixel shader epilog: performs a single export and returns.
;
; Only argument %5 (the first non-inreg float) is used. It is exported as the
; first source operand of llvm.amdgcn.exp.f32 with target operand 0 and enable
; operand 1; the other three source operands are undef. Both trailing i1
; operands of the export are true.
; Function Attrs: alwaysinline
define private amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #4 {
main_body:
  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %5, float undef, float undef, float undef, i1 true, i1 true) #5
  ret void
}

; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5

; Shader entry point: glues the main part and the epilog together.
;
; It takes the same argument list as @main (with the four pointer arguments
; passed byval instead of inreg), breaks the pointer and vector arguments into
; 32-bit scalars, rebuilds them, calls @main, then unpacks the 24-element
; result struct and forwards it to @ps_epilog. The split/rebuild steps are
; value-preserving round trips.
define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #6 {
main_body:
  ; Split each of the four 64-bit pointer arguments into two i32 halves.
  %22 = ptrtoint [12 x <4 x i32>] addrspace(2)* %0 to i64
  %23 = bitcast i64 %22 to <2 x i32>
  %24 = extractelement <2 x i32> %23, i32 0
  %25 = extractelement <2 x i32> %23, i32 1
  %26 = ptrtoint [0 x <8 x i32>] addrspace(2)* %1 to i64
  %27 = bitcast i64 %26 to <2 x i32>
  %28 = extractelement <2 x i32> %27, i32 0
  %29 = extractelement <2 x i32> %27, i32 1
  %30 = ptrtoint [0 x <4 x i32>] addrspace(2)* %2 to i64
  %31 = bitcast i64 %30 to <2 x i32>
  %32 = extractelement <2 x i32> %31, i32 0
  %33 = extractelement <2 x i32> %31, i32 1
  %34 = ptrtoint [80 x <8 x i32>] addrspace(2)* %3 to i64
  %35 = bitcast i64 %34 to <2 x i32>
  %36 = extractelement <2 x i32> %35, i32 0
  %37 = extractelement <2 x i32> %35, i32 1
  ; Reinterpret the remaining scalar and vector arguments (%4, %6..%12, %18,
  ; %19, %21) and split the vectors into float scalars.
  %38 = bitcast float %4 to i32
  %39 = bitcast <2 x i32> %6 to <2 x float>
  %40 = extractelement <2 x float> %39, i32 0
  %41 = extractelement <2 x float> %39, i32 1
  %42 = bitcast <2 x i32> %7 to <2 x float>
  %43 = extractelement <2 x float> %42, i32 0
  %44 = extractelement <2 x float> %42, i32 1
  %45 = bitcast <2 x i32> %8 to <2 x float>
  %46 = extractelement <2 x float> %45, i32 0
  %47 = extractelement <2 x float> %45, i32 1
  %48 = bitcast <3 x i32> %9 to <3 x float>
  %49 = extractelement <3 x float> %48, i32 0
  %50 = extractelement <3 x float> %48, i32 1
  %51 = extractelement <3 x float> %48, i32 2
  %52 = bitcast <2 x i32> %10 to <2 x float>
  %53 = extractelement <2 x float> %52, i32 0
  %54 = extractelement <2 x float> %52, i32 1
  %55 = bitcast <2 x i32> %11 to <2 x float>
  %56 = extractelement <2 x float> %55, i32 0
  %57 = extractelement <2 x float> %55, i32 1
  %58 = bitcast <2 x i32> %12 to <2 x float>
  %59 = extractelement <2 x float> %58, i32 0
  %60 = extractelement <2 x float> %58, i32 1
  %61 = bitcast i32 %18 to float
  %62 = bitcast i32 %19 to float
  %63 = bitcast i32 %21 to float
  ; Rebuild the four pointers from their i32 halves.
  %64 = insertelement <2 x i32> undef, i32 %24, i32 0
  %65 = insertelement <2 x i32> %64, i32 %25, i32 1
  %66 = bitcast <2 x i32> %65 to i64
  %67 = inttoptr i64 %66 to [12 x <4 x i32>] addrspace(2)*
  %68 = insertelement <2 x i32> undef, i32 %28, i32 0
  %69 = insertelement <2 x i32> %68, i32 %29, i32 1
  %70 = bitcast <2 x i32> %69 to i64
  %71 = inttoptr i64 %70 to [0 x <8 x i32>] addrspace(2)*
  %72 = insertelement <2 x i32> undef, i32 %32, i32 0
  %73 = insertelement <2 x i32> %72, i32 %33, i32 1
  %74 = bitcast <2 x i32> %73 to i64
  %75 = inttoptr i64 %74 to [0 x <4 x i32>] addrspace(2)*
  %76 = insertelement <2 x i32> undef, i32 %36, i32 0
  %77 = insertelement <2 x i32> %76, i32 %37, i32 1
  %78 = bitcast <2 x i32> %77 to i64
  %79 = inttoptr i64 %78 to [80 x <8 x i32>] addrspace(2)*
  ; Rebuild the scalar and vector arguments in the types @main expects.
  %80 = bitcast i32 %38 to float
  %81 = insertelement <2 x float> undef, float %40, i32 0
  %82 = insertelement <2 x float> %81, float %41, i32 1
  %83 = bitcast <2 x float> %82 to <2 x i32>
  %84 = insertelement <2 x float> undef, float %43, i32 0
  %85 = insertelement <2 x float> %84, float %44, i32 1
  %86 = bitcast <2 x float> %85 to <2 x i32>
  %87 = insertelement <2 x float> undef, float %46, i32 0
  %88 = insertelement <2 x float> %87, float %47, i32 1
  %89 = bitcast <2 x float> %88 to <2 x i32>
  %90 = insertelement <3 x float> undef, float %49, i32 0
  %91 = insertelement <3 x float> %90, float %50, i32 1
  %92 = insertelement <3 x float> %91, float %51, i32 2
  %93 = bitcast <3 x float> %92 to <3 x i32>
  %94 = insertelement <2 x float> undef, float %53, i32 0
  %95 = insertelement <2 x float> %94, float %54, i32 1
  %96 = bitcast <2 x float> %95 to <2 x i32>
  %97 = insertelement <2 x float> undef, float %56, i32 0
  %98 = insertelement <2 x float> %97, float %57, i32 1
  %99 = bitcast <2 x float> %98 to <2 x i32>
  %100 = insertelement <2 x float> undef, float %59, i32 0
  %101 = insertelement <2 x float> %100, float %60, i32 1
  %102 = bitcast <2 x float> %101 to <2 x i32>
  %103 = bitcast float %61 to i32
  %104 = bitcast float %62 to i32
  %105 = bitcast float %63 to i32
  ; Run the main shader part; arguments %5, %13..%17 and %20 pass through
  ; untouched.
  %106 = call <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([12 x <4 x i32>] addrspace(2)* %67, [0 x <8 x i32>] addrspace(2)* %71, [0 x <4 x i32>] addrspace(2)* %75, [80 x <8 x i32>] addrspace(2)* %79, float %80, i32 %5, <2 x i32> %83, <2 x i32> %86, <2 x i32> %89, <3 x i32> %93, <2 x i32> %96, <2 x i32> %99, <2 x i32> %102, float %13, float %14, float %15, float %16, float %17, i32 %103, i32 %104, float %20, i32 %105)
  ; Unpack all 24 elements of the result struct.
  %107 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 0
  %108 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 1
  %109 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 2
  %110 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 3
  %111 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 4
  %112 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 5
  %113 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 6
  %114 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 7
  %115 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 8
  %116 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 9
  %117 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 10
  %118 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 11
  %119 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 12
  %120 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 13
  %121 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 14
  %122 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 15
  %123 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 16
  %124 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 17
  %125 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 18
  %126 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 19
  %127 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 20
  %128 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 21
  %129 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 22
  %130 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 23
  ; Pack struct elements 0..7 pairwise into the four i64 epilog arguments;
  ; element 8 becomes the inreg float argument.
  %131 = insertelement <2 x i32> undef, i32 %107, i32 0
  %132 = insertelement <2 x i32> %131, i32 %108, i32 1
  %133 = bitcast <2 x i32> %132 to i64
  %134 = insertelement <2 x i32> undef, i32 %109, i32 0
  %135 = insertelement <2 x i32> %134, i32 %110, i32 1
  %136 = bitcast <2 x i32> %135 to i64
  %137 = insertelement <2 x i32> undef, i32 %111, i32 0
  %138 = insertelement <2 x i32> %137, i32 %112, i32 1
  %139 = bitcast <2 x i32> %138 to i64
  %140 = insertelement <2 x i32> undef, i32 %113, i32 0
  %141 = insertelement <2 x i32> %140, i32 %114, i32 1
  %142 = bitcast <2 x i32> %141 to i64
  %143 = bitcast i32 %115 to float
  ; Struct elements 9..23 are forwarded as the fifteen plain float arguments.
  call void @ps_epilog(i64 %133, i64 %136, i64 %139, i64 %142, float %143, float %116, float %117, float %118, float %119, float %120, float %121, float %122, float %123, float %124, float %125, float %126, float %127, float %128, float %129, float %130)
  ret void
}

attributes #0 = { alwaysinline "no-signed-zeros-fp-math"="true" }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind writeonly }
attributes #3 = { nounwind readnone speculatable }
attributes #4 = { alwaysinline "InitialPSInputAddr"="16777215" "no-signed-zeros-fp-math"="true" }
attributes #5 = { nounwind }
attributes #6 = { "no-signed-zeros-fp-math"="true" }
attributes #7 = { nounwind readnone }

!0 = !{}
radeonsi: Compiling shader 6
SHADER KEY
  part.ps.prolog.color_two_side = 0
  part.ps.prolog.flatshade_colors = 0
  part.ps.prolog.poly_stipple = 0
  part.ps.prolog.force_persp_sample_interp = 0
  part.ps.prolog.force_linear_sample_interp = 0
  part.ps.prolog.force_persp_center_interp = 0
  part.ps.prolog.force_linear_center_interp = 0
  part.ps.prolog.bc_optimize_for_persp = 0
  part.ps.prolog.bc_optimize_for_linear = 0
  part.ps.epilog.spi_shader_col_format = 0x1
  part.ps.epilog.color_is_int8 = 0x0
  part.ps.epilog.color_is_int10 = 0x0
  part.ps.epilog.last_cbuf = 0
  part.ps.epilog.alpha_func = 7
  part.ps.epilog.alpha_to_one = 0
  part.ps.epilog.poly_line_smoothing = 0
  part.ps.epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
; Compiled form of the shader. The trailing hex on each line is the
; instruction encoding as printed by the disassembler.
;
; Register roles as used below:
;   s[20:27]  8-dword descriptor loaded from s[6:7] + 0x1c0 (used by the stores)
;   s[12:19]  8-dword descriptor loaded from s[6:7] + 0x1e0 (used by the loads)
;   s0        loop counter i, starts at 0
;   s1        odd store address 2*i + 1, starts at 1 and advances by 2
;   v0        constant load address 16
;
; NOTE(review): the accompanying mail mentions a good and a bad disassembly but
; does not label this listing. Both image_load instructions in BB0_1 are issued
; before either image_store, whereas the attached IR has the second load after
; the first store - presumably this is the failing variant; confirm.
wrapper:
BB0_0:
	s_load_dwordx8 s[20:27], s[6:7], 0x1c0            ; C00E0503 000001C0
	s_load_dwordx8 s[12:19], s[6:7], 0x1e0            ; C00E0303 000001E0
	s_mov_b32 s0, 0                                   ; BE800080
	s_mov_b32 s1, 1                                   ; BE810081
	v_mov_b32_e32 v0, 16                              ; 7E000290
	s_waitcnt lgkmcnt(0)                              ; BF8C007F
	; Clear bit 21 in dword 6 of the store descriptor (s26 = s20 + 6).
	s_and_b32 s26, s26, 0xffdfffff                    ; 861AFF1A FFDFFFFF
	; Skip the loop entirely if i > 767 on entry.
	s_cmpk_gt_i32 s0, 0x2ff                           ; B20002FF
	s_cbranch_scc1 BB0_2                              ; BF850000
BB0_1:
	; Loop body. Both loads read address v0 through descriptor s[12:19].
	image_load v[1:4], v0, s[12:19] dmask:0xf unorm   ; F0001F00 00030100
	image_load v[5:8], v0, s[12:19] dmask:0xf unorm   ; F0001F00 00030500
	; v10 = even store address (s1 - 1), v9 = odd store address (s1).
	s_add_i32 s2, s1, -1                              ; 8102C101
	v_mov_b32_e32 v9, s1                              ; 7E120201
	v_mov_b32_e32 v10, s2                             ; 7E140202
	s_add_i32 s0, s0, 1                               ; 81008100
	s_add_i32 s1, s1, 2                               ; 81018201
	; First load result + 1 per component.
	s_waitcnt vmcnt(1)                                ; BF8C0F71
	v_add_u32_e32 v1, vcc, 1, v1                      ; 32020281
	v_add_u32_e32 v2, vcc, 1, v2                      ; 32040481
	v_add_u32_e32 v3, vcc, 1, v3                      ; 32060681
	v_add_u32_e32 v4, vcc, 1, v4                      ; 32080881
	; Second load result - 1 per component.
	s_waitcnt vmcnt(0)                                ; BF8C0F70
	v_add_u32_e32 v5, vcc, -1, v5                     ; 320A0AC1
	v_add_u32_e32 v6, vcc, -1, v6                     ; 320C0CC1
	v_add_u32_e32 v7, vcc, -1, v7                     ; 320E0EC1
	v_add_u32_e32 v8, vcc, -1, v8                     ; 321010C1
	; Both stores go through descriptor s[20:27], after both loads have completed.
	image_store v[1:4], v10, s[20:27] dmask:0xf unorm ; F0201F00 0005010A
	image_store v[5:8], v9, s[20:27] dmask:0xf unorm  ; F0201F00 00050509
	; Loop while i <= 767.
	s_cmpk_gt_i32 s0, 0x2ff                           ; B20002FF
	s_cbranch_scc0 BB0_1                              ; BF840000
BB0_2:
	; After the loop: move attr0.x into v0 and export it to mrt0.
	s_mov_b32 m0, s9                                  ; BEFC0009
	v_interp_mov_f32 v0, p0, attr0.x                  ; D4020002
	exp mrt0 v0, off, off, off done vm                ; C4001801 00000000
	s_endpgm                                          ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0001
SPI_PS_INPUT_ENA  = 0x0001
*** SHADER STATS ***
SGPRS: 96
VGPRS: 12
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 168 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************


FRAG
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
DCL IMAGE[0], 1D, PIPE_FORMAT_R32_UINT, WR
DCL IMAGE[1], 1D, PIPE_FORMAT_R32_UINT, WR
DCL TEMP[0..11], LOCAL
IMM[0] INT32 {0, 768, 16, 2}
IMM[1] UINT32 {1, 4294967295, 0, 0}
IMM[2] INT32 {1, 0, 0, 0}
  0: MOV TEMP[0].x, IMM[0].xxxx
  1: BGNLOOP
  2:   ISGE TEMP[1].x, TEMP[0].xxxx, IMM[0].yyyy
  3:   UIF TEMP[1].xxxx
  4:     BRK
  5:   ENDIF
  6:   MOV TEMP[2].x, IMM[0].zzzz
  7:   LOAD TEMP[3], IMAGE[0], IMM[0].zzzz, RESTRICT, 1D, PIPE_FORMAT_R32_UINT
  8:   UMUL TEMP[4].x, IMM[0].wwww, TEMP[0].xxxx
  9:   MOV TEMP[5].x, TEMP[4].xxxx
 10:   UADD TEMP[6], TEMP[3], IMM[1].xxxx
 11:   STORE IMAGE[1], TEMP[4].xxxx, TEMP[6], RESTRICT, 1D, PIPE_FORMAT_R32_UINT
 12:   MOV TEMP[7].x, IMM[0].zzzz
 13:   LOAD TEMP[8], IMAGE[0], IMM[0].zzzz, RESTRICT, 1D, PIPE_FORMAT_R32_UINT
 14:   UMAD TEMP[9].x, IMM[0].wwww, TEMP[0].xxxx, IMM[2].xxxx
 15:   MOV TEMP[10].x, TEMP[9].xxxx
 16:   UADD TEMP[11], TEMP[8], IMM[1].yyyy
 17:   STORE IMAGE[1], TEMP[9].xxxx, TEMP[11], RESTRICT, 1D, PIPE_FORMAT_R32_UINT
 18:   UADD TEMP[0].x, TEMP[0].xxxx, IMM[2].xxxx
 19: ENDLOOP
 20: MOV OUT[0], IN[0]
 21: END
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Function Attrs: alwaysinline
define private amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([12 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %TEMP11.w = alloca float
  %TEMP11.z = alloca float
  %TEMP11.y = alloca float
  %TEMP11.x = alloca float
  %TEMP10.w = alloca float
  %TEMP10.z = alloca float
  %TEMP10.y = alloca float
  %TEMP10.x = alloca float
  %TEMP9.w = alloca float
  %TEMP9.z = alloca float
  %TEMP9.y = alloca float
  %TEMP9.x = alloca float
  %TEMP8.w = alloca float
  %TEMP8.z = alloca float
  %TEMP8.y = alloca float
  %TEMP8.x = alloca float
  %TEMP7.w = alloca float
  %TEMP7.z = alloca float
  %TEMP7.y = alloca float
  %TEMP7.x = alloca float
  %TEMP6.w = alloca float
  %TEMP6.z = alloca float
  %TEMP6.y = alloca float
  %TEMP6.x = alloca float
  %TEMP5.w = alloca float
  %TEMP5.z = alloca float
  %TEMP5.y = alloca float
  %TEMP5.x = alloca float
  %TEMP4.w = alloca float
  %TEMP4.z = alloca float
  %TEMP4.y = alloca float
  %TEMP4.x = alloca float
  %TEMP3.w = alloca float
  %TEMP3.z = alloca float
  %TEMP3.y = alloca float
  %TEMP3.x = alloca float
  %TEMP2.w = alloca float
  %TEMP2.z = alloca float
  %TEMP2.y = alloca float
  %TEMP2.x = alloca float
  %TEMP1.w = alloca float
  %TEMP1.z = alloca float
  %TEMP1.y = alloca float
  %TEMP1.x = alloca float
  %TEMP0.w = alloca float
  %TEMP0.z = alloca float
  %TEMP0.y = alloca float
  %TEMP0.x = alloca float
  %OUT0.w = alloca float
  %OUT0.z = alloca float
  %OUT0.y = alloca float
  %OUT0.x = alloca float
  store float 0.000000e+00, float* %TEMP0.x
  br label %loop1

loop1:                                            ; preds = %endif5, %main_body
  %22 = load float, float* %TEMP0.x
  %23 = bitcast float %22 to i32
  %24 = icmp sge i32 %23, 768
  %25 = sext i1 %24 to i32
  %26 = bitcast i32 %25 to float
  store float %26, float* %TEMP1.x
  %27 = load float, float* %TEMP1.x
  %28 = bitcast float %27 to i32
  %29 = icmp ne i32 %28, 0
  br i1 %29, label %if3, label %endif5

if3:                                              ; preds = %loop1
  br label %endloop19

endif5:                                           ; preds = %loop1
  store float 0x36E0000000000000, float* %TEMP2.x
  %30 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 15, !amdgpu.uniform !0
  %31 = load <8 x i32>, <8 x i32> addrspace(2)* %30, !invariant.load !0
  %32 = call nsz <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 16, <8 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false) #7
  %33 = extractelement <4 x float> %32, i32 0
  %34 = extractelement <4 x float> %32, i32 1
  %35 = extractelement <4 x float> %32, i32 2
  %36 = extractelement <4 x float> %32, i32 3
  store float %33, float* %TEMP3.x
  store float %34, float* %TEMP3.y
  store float %35, float* %TEMP3.z
  store float %36, float* %TEMP3.w
  %37 = load float, float* %TEMP0.x
  %38 = bitcast float %37 to i32
  %39 = mul i32 2, %38
  %40 = bitcast i32 %39 to float
  store float %40, float* %TEMP4.x
  %41 = load float, float* %TEMP4.x
  store float %41, float* %TEMP5.x
  %42 = load float, float* %TEMP3.x
  %43 = bitcast float %42 to i32
  %44 = add i32 %43, 1
  %45 = load float, float* %TEMP3.y
  %46 = bitcast float %45 to i32
  %47 = add i32 %46, 1
  %48 = load float, float* %TEMP3.z
  %49 = bitcast float %48 to i32
  %50 = add i32 %49, 1
  %51 = load float, float* %TEMP3.w
  %52 = bitcast float %51 to i32
  %53 = add i32 %52, 1
  %54 = bitcast i32 %44 to float
  store float %54, float* %TEMP6.x
  %55 = bitcast i32 %47 to float
  store float %55, float* %TEMP6.y
  %56 = bitcast i32 %50 to float
  store float %56, float* %TEMP6.z
  %57 = bitcast i32 %53 to float
  store float %57, float* %TEMP6.w
  %58 = load float, float* %TEMP6.x
  %59 = load float, float* %TEMP6.y
  %60 = load float, float* %TEMP6.z
  %61 = load float, float* %TEMP6.w
  %62 = insertelement <4 x float> undef, float %58, i32 0
  %63 = insertelement <4 x float> %62, float %59, i32 1
  %64 = insertelement <4 x float> %63, float %60, i32 2
  %65 = insertelement <4 x float> %64, float %61, i32 3
  %66 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 14, !amdgpu.uniform !0
  %67 = load <8 x i32>, <8 x i32> addrspace(2)* %66, !invariant.load !0
  %68 = extractelement <8 x i32> %67, i32 6
  %69 = and i32 %68, -2097153
  %70 = insertelement <8 x i32> %67, i32 %69, i32 6
  %71 = load float, float* %TEMP4.x
  %72 = bitcast float %71 to i32
  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %65, i32 %72, <8 x i32> %70, i32 15, i1 false, i1 false, i1 false, i1 false) #8
  store float 0x36E0000000000000, float* %TEMP7.x
  %73 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 15, !amdgpu.uniform !0
  %74 = load <8 x i32>, <8 x i32> addrspace(2)* %73, !invariant.load !0
  %75 = call nsz <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 16, <8 x i32> %74, i32 15, i1 false, i1 false, i1 false, i1 false) #7
  %76 = extractelement <4 x float> %75, i32 0
  %77 = extractelement <4 x float> %75, i32 1
  %78 = extractelement <4 x float> %75, i32 2
  %79 = extractelement <4 x float> %75, i32 3
  store float %76, float* %TEMP8.x
  store float %77, float* %TEMP8.y
  store float %78, float* %TEMP8.z
  store float %79, float* %TEMP8.w
  %80 = load float, float* %TEMP0.x
  %81 = bitcast float %80 to i32
  %82 = mul i32 2, %81
  %83 = add i32 %82, 1
  %84 = bitcast i32 %83 to float
  store float %84, float* %TEMP9.x
  %85 = load float, float* %TEMP9.x
  store float %85, float* %TEMP10.x
  %86 = load float, float* %TEMP8.x
  %87 = bitcast float %86 to i32
  %88 = add i32 %87, -1
  %89 = load float, float* %TEMP8.y
  %90 = bitcast float %89 to i32
  %91 = add i32 %90, -1
  %92 = load float, float* %TEMP8.z
  %93 = bitcast float %92 to i32
  %94 = add i32 %93, -1
  %95 = load float, float* %TEMP8.w
  %96 = bitcast float %95 to i32
  %97 = add i32 %96, -1
  %98 = bitcast i32 %88 to float
  store float %98, float* %TEMP11.x
  %99 = bitcast i32 %91 to float
  store float %99, float* %TEMP11.y
  %100 = bitcast i32 %94 to float
  store float %100, float* %TEMP11.z
  %101 = bitcast i32 %97 to float
  store float %101, float* %TEMP11.w
  %102 = load float, float* %TEMP11.x
  %103 = load float, float* %TEMP11.y
  %104 = load float, float* %TEMP11.z
  %105 = load float, float* %TEMP11.w
  %106 = insertelement <4 x float> undef, float %102, i32 0
  %107 = insertelement <4 x float> %106, float %103, i32 1
  %108 = insertelement <4 x float> %107, float %104, i32 2
  %109 = insertelement <4 x float> %108, float %105, i32 3
  %110 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 14, !amdgpu.uniform !0
  %111 = load <8 x i32>, <8 x i32> addrspace(2)* %110, !invariant.load !0
  %112 = extractelement <8 x i32> %111, i32 6
  %113 = and i32 %112, -2097153
  %114 = insertelement <8 x i32> %111, i32 %113, i32 6
  %115 = load float, float* %TEMP9.x
  %116 = bitcast float %115 to i32
  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %109, i32 %116, <8 x i32> %114, i32 15, i1 false, i1 false, i1 false, i1 false) #8
  %117 = load float, float* %TEMP0.x
  %118 = bitcast float %117 to i32
  %119 = add i32 %118, 1
  %120 = bitcast i32 %119 to float
  store float %120, float* %TEMP0.x
  br label %loop1

endloop19:                                        ; preds = %if3
  %121 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %122 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %123 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %124 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %125 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %126 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %127 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %128 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %129 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %130 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %131 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %132 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %133 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %134 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %135 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %136 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  store float %121, float* %OUT0.x
  store float %126, float* %OUT0.y
  store float %131, float* %OUT0.z
  store float %136, float* %OUT0.w
  %137 = load float, float* %OUT0.x
  %138 = load float, float* %OUT0.y
  %139 = load float, float* %OUT0.z
  %140 = load float, float* %OUT0.w
  %141 = bitcast float %4 to i32
  %142 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %141, 8
  %143 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %142, float %137, 9
  %144 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %143, float %138, 10
  %145 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %144, float %139, 11
  %146 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %145, float %140, 12
  %147 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %146, float %20, 23
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %147
}

; Declarations of the AMDGPU intrinsics called from @main in this dump.
; NOTE(review): in this dump the call sites do not reuse the groups named on
; the image declarations: both image.load calls carry #7 (nounwind readnone)
; and both image.store calls carry #8 (inaccessiblememonly nounwind).
; Presumably that is why the disassembly further down contains a single
; image_load per loop iteration -- confirm against the optimized IR.

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #2

; Called 16 times in @main's endloop19 block, always with call-site group #7.
; Function Attrs: nounwind readnone speculatable
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #3

; Pixel-shader epilog part, called from @wrapper with the unpacked return
; struct of @main. Arguments are unnamed: %0-%3 are the four inreg i64 values,
; %4 the inreg float, %5-%19 the fifteen remaining floats.
; Only %5 is read in this body; every other argument is unused here.
; Function Attrs: alwaysinline
define private amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #4 {
main_body:
  ; Single export: first i32 is 0, second is 1, %5 is the only defined data
  ; operand (the other three are undef), and both trailing i1 flags are true.
  ; The disassembly below renders this as "exp mrt0 v0, off, off, off done vm".
  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %5, float undef, float undef, float undef, i1 true, i1 true) #5
  ret void
}

; Export intrinsic; its only call in this module is the one in @ps_epilog.
; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5

; Non-private amdgpu_ps function that glues the shader parts together: it
; forwards its arguments to @main and passes every field of @main's returned
; struct on to @ps_epilog.
;
; Unnamed arguments:
;   %0-%3    byval pointers (addrspace(2)) to descriptor arrays
;   %4       float inreg        %5       i32 inreg
;   %6-%8    <2 x i32>          %9       <3 x i32>
;   %10-%12  <2 x i32>          %13-%17  float
;   %18,%19  i32                %20      float       %21  i32
;
; Each argument is first split into 32-bit scalars and then reassembled into
; its original type before the call, so the split/rebuild pairs below are
; value-preserving round trips.
define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #6 {
main_body:
  ; --- Split phase: each 64-bit pointer %0-%3 becomes two i32 halves. ---
  %22 = ptrtoint [12 x <4 x i32>] addrspace(2)* %0 to i64
  %23 = bitcast i64 %22 to <2 x i32>
  %24 = extractelement <2 x i32> %23, i32 0
  %25 = extractelement <2 x i32> %23, i32 1
  %26 = ptrtoint [0 x <8 x i32>] addrspace(2)* %1 to i64
  %27 = bitcast i64 %26 to <2 x i32>
  %28 = extractelement <2 x i32> %27, i32 0
  %29 = extractelement <2 x i32> %27, i32 1
  %30 = ptrtoint [0 x <4 x i32>] addrspace(2)* %2 to i64
  %31 = bitcast i64 %30 to <2 x i32>
  %32 = extractelement <2 x i32> %31, i32 0
  %33 = extractelement <2 x i32> %31, i32 1
  %34 = ptrtoint [80 x <8 x i32>] addrspace(2)* %3 to i64
  %35 = bitcast i64 %34 to <2 x i32>
  %36 = extractelement <2 x i32> %35, i32 0
  %37 = extractelement <2 x i32> %35, i32 1
  ; %4 is carried as an i32 bit pattern; the integer vectors %6-%12 are
  ; reinterpreted as float vectors and split into scalar floats.
  %38 = bitcast float %4 to i32
  %39 = bitcast <2 x i32> %6 to <2 x float>
  %40 = extractelement <2 x float> %39, i32 0
  %41 = extractelement <2 x float> %39, i32 1
  %42 = bitcast <2 x i32> %7 to <2 x float>
  %43 = extractelement <2 x float> %42, i32 0
  %44 = extractelement <2 x float> %42, i32 1
  %45 = bitcast <2 x i32> %8 to <2 x float>
  %46 = extractelement <2 x float> %45, i32 0
  %47 = extractelement <2 x float> %45, i32 1
  %48 = bitcast <3 x i32> %9 to <3 x float>
  %49 = extractelement <3 x float> %48, i32 0
  %50 = extractelement <3 x float> %48, i32 1
  %51 = extractelement <3 x float> %48, i32 2
  %52 = bitcast <2 x i32> %10 to <2 x float>
  %53 = extractelement <2 x float> %52, i32 0
  %54 = extractelement <2 x float> %52, i32 1
  %55 = bitcast <2 x i32> %11 to <2 x float>
  %56 = extractelement <2 x float> %55, i32 0
  %57 = extractelement <2 x float> %55, i32 1
  %58 = bitcast <2 x i32> %12 to <2 x float>
  %59 = extractelement <2 x float> %58, i32 0
  %60 = extractelement <2 x float> %58, i32 1
  ; The i32 arguments %18, %19 and %21 are carried as float bit patterns.
  %61 = bitcast i32 %18 to float
  %62 = bitcast i32 %19 to float
  %63 = bitcast i32 %21 to float
  ; --- Rebuild phase: reassemble the four pointers from their halves. ---
  %64 = insertelement <2 x i32> undef, i32 %24, i32 0
  %65 = insertelement <2 x i32> %64, i32 %25, i32 1
  %66 = bitcast <2 x i32> %65 to i64
  %67 = inttoptr i64 %66 to [12 x <4 x i32>] addrspace(2)*
  %68 = insertelement <2 x i32> undef, i32 %28, i32 0
  %69 = insertelement <2 x i32> %68, i32 %29, i32 1
  %70 = bitcast <2 x i32> %69 to i64
  %71 = inttoptr i64 %70 to [0 x <8 x i32>] addrspace(2)*
  %72 = insertelement <2 x i32> undef, i32 %32, i32 0
  %73 = insertelement <2 x i32> %72, i32 %33, i32 1
  %74 = bitcast <2 x i32> %73 to i64
  %75 = inttoptr i64 %74 to [0 x <4 x i32>] addrspace(2)*
  %76 = insertelement <2 x i32> undef, i32 %36, i32 0
  %77 = insertelement <2 x i32> %76, i32 %37, i32 1
  %78 = bitcast <2 x i32> %77 to i64
  %79 = inttoptr i64 %78 to [80 x <8 x i32>] addrspace(2)*
  ; Rebuild the scalar float and the integer vectors in @main's parameter types.
  %80 = bitcast i32 %38 to float
  %81 = insertelement <2 x float> undef, float %40, i32 0
  %82 = insertelement <2 x float> %81, float %41, i32 1
  %83 = bitcast <2 x float> %82 to <2 x i32>
  %84 = insertelement <2 x float> undef, float %43, i32 0
  %85 = insertelement <2 x float> %84, float %44, i32 1
  %86 = bitcast <2 x float> %85 to <2 x i32>
  %87 = insertelement <2 x float> undef, float %46, i32 0
  %88 = insertelement <2 x float> %87, float %47, i32 1
  %89 = bitcast <2 x float> %88 to <2 x i32>
  %90 = insertelement <3 x float> undef, float %49, i32 0
  %91 = insertelement <3 x float> %90, float %50, i32 1
  %92 = insertelement <3 x float> %91, float %51, i32 2
  %93 = bitcast <3 x float> %92 to <3 x i32>
  %94 = insertelement <2 x float> undef, float %53, i32 0
  %95 = insertelement <2 x float> %94, float %54, i32 1
  %96 = bitcast <2 x float> %95 to <2 x i32>
  %97 = insertelement <2 x float> undef, float %56, i32 0
  %98 = insertelement <2 x float> %97, float %57, i32 1
  %99 = bitcast <2 x float> %98 to <2 x i32>
  %100 = insertelement <2 x float> undef, float %59, i32 0
  %101 = insertelement <2 x float> %100, float %60, i32 1
  %102 = bitcast <2 x float> %101 to <2 x i32>
  %103 = bitcast float %61 to i32
  %104 = bitcast float %62 to i32
  %105 = bitcast float %63 to i32
  ; --- Main shader part. %5, %13-%17 and %20 are passed through untouched. ---
  %106 = call <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([12 x <4 x i32>] addrspace(2)* %67, [0 x <8 x i32>] addrspace(2)* %71, [0 x <4 x i32>] addrspace(2)* %75, [80 x <8 x i32>] addrspace(2)* %79, float %80, i32 %5, <2 x i32> %83, <2 x i32> %86, <2 x i32> %89, <3 x i32> %93, <2 x i32> %96, <2 x i32> %99, <2 x i32> %102, float %13, float %14, float %15, float %16, float %17, i32 %103, i32 %104, float %20, i32 %105)
  ; --- Unpack all 24 fields of the returned struct (9 x i32, 15 x float). ---
  ; @main in this dump only inserts into fields 8, 9-12 and 23; the remaining
  ; fields come from the undef base of its insertvalue chain.
  %107 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 0
  %108 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 1
  %109 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 2
  %110 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 3
  %111 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 4
  %112 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 5
  %113 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 6
  %114 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 7
  %115 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 8
  %116 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 9
  %117 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 10
  %118 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 11
  %119 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 12
  %120 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 13
  %121 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 14
  %122 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 15
  %123 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 16
  %124 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 17
  %125 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 18
  %126 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 19
  %127 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 20
  %128 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 21
  %129 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 22
  %130 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 23
  ; Pair up i32 fields 0-7 into the four i64 arguments of @ps_epilog.
  %131 = insertelement <2 x i32> undef, i32 %107, i32 0
  %132 = insertelement <2 x i32> %131, i32 %108, i32 1
  %133 = bitcast <2 x i32> %132 to i64
  %134 = insertelement <2 x i32> undef, i32 %109, i32 0
  %135 = insertelement <2 x i32> %134, i32 %110, i32 1
  %136 = bitcast <2 x i32> %135 to i64
  %137 = insertelement <2 x i32> undef, i32 %111, i32 0
  %138 = insertelement <2 x i32> %137, i32 %112, i32 1
  %139 = bitcast <2 x i32> %138 to i64
  %140 = insertelement <2 x i32> undef, i32 %113, i32 0
  %141 = insertelement <2 x i32> %140, i32 %114, i32 1
  %142 = bitcast <2 x i32> %141 to i64
  ; Field 8 (i32) becomes the inreg float argument of the epilog.
  %143 = bitcast i32 %115 to float
  ; --- Epilog part. Its sixth argument, the only one it reads, is %116,
  ; i.e. field 9 of @main's result. ---
  call void @ps_epilog(i64 %133, i64 %136, i64 %139, i64 %142, float %143, float %116, float %117, float %118, float %119, float %120, float %121, float %122, float %123, float %124, float %125, float %126, float %127, float %128, float %129, float %130)
  ret void
}

; Attribute groups referenced by this module dump.
; #0: definition of @main.
attributes #0 = { alwaysinline "no-signed-zeros-fp-math"="true" }
; #1: declaration of llvm.amdgcn.image.load (the calls in @main use #7).
attributes #1 = { nounwind readonly }
; #2: declaration of llvm.amdgcn.image.store (the calls in @main use #8).
attributes #2 = { nounwind writeonly }
; #3: declaration of llvm.amdgcn.interp.mov.
attributes #3 = { nounwind readnone speculatable }
; #4: definition of @ps_epilog.
attributes #4 = { alwaysinline "InitialPSInputAddr"="16777215" "no-signed-zeros-fp-math"="true" }
; #5: declaration of llvm.amdgcn.exp.f32 and its call in @ps_epilog.
attributes #5 = { nounwind }
; #6: definition of @wrapper.
attributes #6 = { "no-signed-zeros-fp-math"="true" }
; #7: call sites of image.load and interp.mov in @main.
attributes #7 = { nounwind readnone }
; #8: call sites of image.store in @main.
attributes #8 = { inaccessiblememonly nounwind }

; Empty metadata node, used as the operand of !amdgpu.uniform and
; !invariant.load on the descriptor address computations and loads in @main.
!0 = !{}
radeonsi: Compiling shader 8
SHADER KEY
  part.ps.prolog.color_two_side = 0
  part.ps.prolog.flatshade_colors = 0
  part.ps.prolog.poly_stipple = 0
  part.ps.prolog.force_persp_sample_interp = 0
  part.ps.prolog.force_linear_sample_interp = 0
  part.ps.prolog.force_persp_center_interp = 0
  part.ps.prolog.force_linear_center_interp = 0
  part.ps.prolog.bc_optimize_for_persp = 0
  part.ps.prolog.bc_optimize_for_linear = 0
  part.ps.epilog.spi_shader_col_format = 0x1
  part.ps.epilog.color_is_int8 = 0x0
  part.ps.epilog.color_is_int10 = 0x0
  part.ps.epilog.last_cbuf = 0
  part.ps.epilog.alpha_func = 7
  part.ps.epilog.alpha_to_one = 0
  part.ps.epilog.poly_line_smoothing = 0
  part.ps.epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
; Disassembly of the compiled @wrapper (with @main and @ps_epilog folded in).
; The trailing "; XXXXXXXX" on each line is the instruction encoding.
; Register roles, as read from the code below:
;   s[20:27]  8-dword descriptor loaded from byte offset 0x1c0 (= 14 * 32),
;             used by both image_store instructions
;   s[12:19]  8-dword descriptor loaded from byte offset 0x1e0 (= 15 * 32),
;             used by the image_load
;   s0        loop counter i, starts at 0, +1 per iteration
;   s1        2*i + 1, starts at 1, +2 per iteration
;   v0        constant 16, the image_load coordinate
; NOTE(review): s[6:7] is presumably the [80 x <8 x i32>] pointer (%3) and s9
; the inreg i32 (%5) -- consistent with the IR, but the register assignment is
; not shown in this dump.
wrapper:
BB0_0:
	s_load_dwordx8 s[20:27], s[6:7], 0x1c0            ; C00E0503 000001C0
	s_load_dwordx8 s[12:19], s[6:7], 0x1e0            ; C00E0303 000001E0
	s_mov_b32 s0, 0                                   ; BE800080
	s_mov_b32 s1, 1                                   ; BE810081
	v_mov_b32_e32 v0, 16                              ; 7E000290
	s_waitcnt lgkmcnt(0)                              ; BF8C007F
	; 0xffdfffff == -2097153: the IR's "and i32 ..., -2097153" on dword 6 of the
	; store descriptor (clears bit 21), done once before the loop.
	s_and_b32 s26, s26, 0xffdfffff                    ; 861AFF1A FFDFFFFF
	; Loop guard: 0x2ff == 767, so "s0 > 767" is the IR's "icmp sge ..., 768".
	s_cmpk_gt_i32 s0, 0x2ff                           ; B20002FF
	s_cbranch_scc1 BB0_2                              ; BF850000
BB0_1:
	; Loop body. There is one image_load per iteration here, whereas the IR's
	; endif5 block calls image.load twice (once before and once after the first
	; image.store).
	image_load v[1:4], v0, s[12:19] dmask:0xf unorm   ; F0001F00 00030100
	s_add_i32 s2, s1, -1                              ; 8102C101
	; v9 = 2*i + 1 and v10 = 2*i: the two store coordinates for this iteration.
	v_mov_b32_e32 v9, s1                              ; 7E120201
	v_mov_b32_e32 v10, s2                             ; 7E140202
	s_add_i32 s0, s0, 1                               ; 81008100
	s_add_i32 s1, s1, 2                               ; 81018201
	s_waitcnt vmcnt(0)                                ; BF8C0F70
	; v[5:8] = loaded texel + 1 (per channel, integer add).
	v_add_u32_e32 v5, vcc, 1, v1                      ; 320A0281
	v_add_u32_e32 v6, vcc, 1, v2                      ; 320C0481
	v_add_u32_e32 v7, vcc, 1, v3                      ; 320E0681
	v_add_u32_e32 v8, vcc, 1, v4                      ; 32100881
	; v[1:4] = the same loaded texel - 1, computed in place.
	v_add_u32_e32 v1, vcc, -1, v1                     ; 320202C1
	v_add_u32_e32 v2, vcc, -1, v2                     ; 320404C1
	v_add_u32_e32 v3, vcc, -1, v3                     ; 320606C1
	v_add_u32_e32 v4, vcc, -1, v4                     ; 320808C1
	; Both stores are fed from the single load above: +1 data goes to coordinate
	; 2*i, -1 data to coordinate 2*i + 1.
	image_store v[5:8], v10, s[20:27] dmask:0xf unorm ; F0201F00 0005050A
	image_store v[1:4], v9, s[20:27] dmask:0xf unorm  ; F0201F00 00050109
	s_cmpk_gt_i32 s0, 0x2ff                           ; B20002FF
	s_cbranch_scc0 BB0_1                              ; BF840000
BB0_2:
	; After the loop: one interpolation move and one export, then end of
	; program. The IR's endloop19 block has 16 interp.mov calls; only one
	; v_interp_mov_f32 is left here, matching the single value @ps_epilog reads.
	s_mov_b32 m0, s9                                  ; BEFC0009
	v_interp_mov_f32 v0, p0, attr0.x                  ; D4020002
	exp mrt0 v0, off, off, off done vm                ; C4001801 00000000
	s_endpgm                                          ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0001
SPI_PS_INPUT_ENA  = 0x0001
*** SHADER STATS ***
SGPRS: 96
VGPRS: 12
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 156 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************


-------------- next part --------------
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Main shader part of the second dump, called from @wrapper.
;
; Shape of the code: a loop runs while the integer counter held in %TEMP0.x is
; below 768. Each iteration loads a texel from the image described by element
; 15 of the descriptor array %3 (coordinate 16), stores texel+1 at coordinate
; 2*i of the image described by element 14, loads the same texel again, and
; stores texel-1 at coordinate 2*i+1. After the loop it fills OUT0.xyzw from
; interp.mov and returns a 24-field struct.
;
; In this dump the image intrinsic call sites carry #1 (loads) and #2 (stores),
; the same groups as the declarations below. The interp.mov call sites carry
; #7, whose definition lies outside this excerpt.
;
; All TEMP/OUT slots are float allocas; integer values are kept in them as raw
; bit patterns and moved in and out with bitcast.
; Function Attrs: alwaysinline
define private amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([12 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  ; One alloca per channel of TEMP0..TEMP11 and OUT0. Many of these slots are
  ; never accessed in the body below.
  %TEMP11.w = alloca float
  %TEMP11.z = alloca float
  %TEMP11.y = alloca float
  %TEMP11.x = alloca float
  %TEMP10.w = alloca float
  %TEMP10.z = alloca float
  %TEMP10.y = alloca float
  %TEMP10.x = alloca float
  %TEMP9.w = alloca float
  %TEMP9.z = alloca float
  %TEMP9.y = alloca float
  %TEMP9.x = alloca float
  %TEMP8.w = alloca float
  %TEMP8.z = alloca float
  %TEMP8.y = alloca float
  %TEMP8.x = alloca float
  %TEMP7.w = alloca float
  %TEMP7.z = alloca float
  %TEMP7.y = alloca float
  %TEMP7.x = alloca float
  %TEMP6.w = alloca float
  %TEMP6.z = alloca float
  %TEMP6.y = alloca float
  %TEMP6.x = alloca float
  %TEMP5.w = alloca float
  %TEMP5.z = alloca float
  %TEMP5.y = alloca float
  %TEMP5.x = alloca float
  %TEMP4.w = alloca float
  %TEMP4.z = alloca float
  %TEMP4.y = alloca float
  %TEMP4.x = alloca float
  %TEMP3.w = alloca float
  %TEMP3.z = alloca float
  %TEMP3.y = alloca float
  %TEMP3.x = alloca float
  %TEMP2.w = alloca float
  %TEMP2.z = alloca float
  %TEMP2.y = alloca float
  %TEMP2.x = alloca float
  %TEMP1.w = alloca float
  %TEMP1.z = alloca float
  %TEMP1.y = alloca float
  %TEMP1.x = alloca float
  %TEMP0.w = alloca float
  %TEMP0.z = alloca float
  %TEMP0.y = alloca float
  %TEMP0.x = alloca float
  %OUT0.w = alloca float
  %OUT0.z = alloca float
  %OUT0.y = alloca float
  %OUT0.x = alloca float
  ; Loop counter i = 0 (float 0.0 has the all-zero bit pattern).
  store float 0.000000e+00, float* %TEMP0.x
  br label %loop1

loop1:                                            ; preds = %endif5, %main_body
  ; Exit test: TEMP1.x = (i >= 768) ? -1 : 0 via sext; leave the loop when it
  ; is non-zero.
  %22 = load float, float* %TEMP0.x
  %23 = bitcast float %22 to i32
  %24 = icmp sge i32 %23, 768
  %25 = sext i1 %24 to i32
  %26 = bitcast i32 %25 to float
  store float %26, float* %TEMP1.x
  %27 = load float, float* %TEMP1.x
  %28 = bitcast float %27 to i32
  %29 = icmp ne i32 %28, 0
  br i1 %29, label %if3, label %endif5

if3:                                              ; preds = %loop1
  br label %endloop19

endif5:                                           ; preds = %loop1
  ; 0x36E0000000000000 is 2^-145, the float whose bit pattern is the integer
  ; 16, i.e. the same constant used as the load coordinate below. TEMP2.x is
  ; never loaded in this function.
  store float 0x36E0000000000000, float* %TEMP2.x
  ; First load: descriptor = element 15 of %3, coordinate 16, mask 15.
  %30 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 15, !amdgpu.uniform !0
  %31 = load <8 x i32>, <8 x i32> addrspace(2)* %30, !invariant.load !0
  %32 = call nsz <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 16, <8 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false) #1
  %33 = extractelement <4 x float> %32, i32 0
  %34 = extractelement <4 x float> %32, i32 1
  %35 = extractelement <4 x float> %32, i32 2
  %36 = extractelement <4 x float> %32, i32 3
  store float %33, float* %TEMP3.x
  store float %34, float* %TEMP3.y
  store float %35, float* %TEMP3.z
  store float %36, float* %TEMP3.w
  ; TEMP4.x = 2 * i, the coordinate of the first store. The copy into TEMP5.x
  ; is never loaded.
  %37 = load float, float* %TEMP0.x
  %38 = bitcast float %37 to i32
  %39 = mul i32 2, %38
  %40 = bitcast i32 %39 to float
  store float %40, float* %TEMP4.x
  %41 = load float, float* %TEMP4.x
  store float %41, float* %TEMP5.x
  ; TEMP6.xyzw = TEMP3.xyzw + 1, as an integer add on each channel's bits.
  %42 = load float, float* %TEMP3.x
  %43 = bitcast float %42 to i32
  %44 = add i32 %43, 1
  %45 = load float, float* %TEMP3.y
  %46 = bitcast float %45 to i32
  %47 = add i32 %46, 1
  %48 = load float, float* %TEMP3.z
  %49 = bitcast float %48 to i32
  %50 = add i32 %49, 1
  %51 = load float, float* %TEMP3.w
  %52 = bitcast float %51 to i32
  %53 = add i32 %52, 1
  %54 = bitcast i32 %44 to float
  store float %54, float* %TEMP6.x
  %55 = bitcast i32 %47 to float
  store float %55, float* %TEMP6.y
  %56 = bitcast i32 %50 to float
  store float %56, float* %TEMP6.z
  %57 = bitcast i32 %53 to float
  store float %57, float* %TEMP6.w
  ; Gather TEMP6 into the <4 x float> store payload.
  %58 = load float, float* %TEMP6.x
  %59 = load float, float* %TEMP6.y
  %60 = load float, float* %TEMP6.z
  %61 = load float, float* %TEMP6.w
  %62 = insertelement <4 x float> undef, float %58, i32 0
  %63 = insertelement <4 x float> %62, float %59, i32 1
  %64 = insertelement <4 x float> %63, float %60, i32 2
  %65 = insertelement <4 x float> %64, float %61, i32 3
  ; Store descriptor = element 14 of %3 with bit 21 of dword 6 cleared
  ; (-2097153 == ~0x200000). NOTE(review): the meaning of that bit is not
  ; established by this dump.
  %66 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 14, !amdgpu.uniform !0
  %67 = load <8 x i32>, <8 x i32> addrspace(2)* %66, !invariant.load !0
  %68 = extractelement <8 x i32> %67, i32 6
  %69 = and i32 %68, -2097153
  %70 = insertelement <8 x i32> %67, i32 %69, i32 6
  %71 = load float, float* %TEMP4.x
  %72 = bitcast float %71 to i32
  ; First store: payload TEMP6 (texel + 1) at coordinate 2 * i.
  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %65, i32 %72, <8 x i32> %70, i32 15, i1 false, i1 false, i1 false, i1 false) #2
  ; Same bit-pattern-16 constant as TEMP2.x; TEMP7.x is never loaded either.
  store float 0x36E0000000000000, float* %TEMP7.x
  ; Second load, issued after the first store: same descriptor element (15)
  ; and the same coordinate (16) as the first load.
  %73 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 15, !amdgpu.uniform !0
  %74 = load <8 x i32>, <8 x i32> addrspace(2)* %73, !invariant.load !0
  %75 = call nsz <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 16, <8 x i32> %74, i32 15, i1 false, i1 false, i1 false, i1 false) #1
  %76 = extractelement <4 x float> %75, i32 0
  %77 = extractelement <4 x float> %75, i32 1
  %78 = extractelement <4 x float> %75, i32 2
  %79 = extractelement <4 x float> %75, i32 3
  store float %76, float* %TEMP8.x
  store float %77, float* %TEMP8.y
  store float %78, float* %TEMP8.z
  store float %79, float* %TEMP8.w
  ; TEMP9.x = 2 * i + 1, the coordinate of the second store. The copy into
  ; TEMP10.x is never loaded.
  %80 = load float, float* %TEMP0.x
  %81 = bitcast float %80 to i32
  %82 = mul i32 2, %81
  %83 = add i32 %82, 1
  %84 = bitcast i32 %83 to float
  store float %84, float* %TEMP9.x
  %85 = load float, float* %TEMP9.x
  store float %85, float* %TEMP10.x
  ; TEMP11.xyzw = TEMP8.xyzw - 1, as an integer add of -1 per channel.
  %86 = load float, float* %TEMP8.x
  %87 = bitcast float %86 to i32
  %88 = add i32 %87, -1
  %89 = load float, float* %TEMP8.y
  %90 = bitcast float %89 to i32
  %91 = add i32 %90, -1
  %92 = load float, float* %TEMP8.z
  %93 = bitcast float %92 to i32
  %94 = add i32 %93, -1
  %95 = load float, float* %TEMP8.w
  %96 = bitcast float %95 to i32
  %97 = add i32 %96, -1
  %98 = bitcast i32 %88 to float
  store float %98, float* %TEMP11.x
  %99 = bitcast i32 %91 to float
  store float %99, float* %TEMP11.y
  %100 = bitcast i32 %94 to float
  store float %100, float* %TEMP11.z
  %101 = bitcast i32 %97 to float
  store float %101, float* %TEMP11.w
  ; Gather TEMP11 into the second store payload.
  %102 = load float, float* %TEMP11.x
  %103 = load float, float* %TEMP11.y
  %104 = load float, float* %TEMP11.z
  %105 = load float, float* %TEMP11.w
  %106 = insertelement <4 x float> undef, float %102, i32 0
  %107 = insertelement <4 x float> %106, float %103, i32 1
  %108 = insertelement <4 x float> %107, float %104, i32 2
  %109 = insertelement <4 x float> %108, float %105, i32 3
  ; Store descriptor rebuilt exactly as for the first store (element 14,
  ; bit 21 of dword 6 cleared).
  %110 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 14, !amdgpu.uniform !0
  %111 = load <8 x i32>, <8 x i32> addrspace(2)* %110, !invariant.load !0
  %112 = extractelement <8 x i32> %111, i32 6
  %113 = and i32 %112, -2097153
  %114 = insertelement <8 x i32> %111, i32 %113, i32 6
  %115 = load float, float* %TEMP9.x
  %116 = bitcast float %115 to i32
  ; Second store: payload TEMP11 (second texel - 1) at coordinate 2 * i + 1.
  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %109, i32 %116, <8 x i32> %114, i32 15, i1 false, i1 false, i1 false, i1 false) #2
  ; i = i + 1, then back to the loop header.
  %117 = load float, float* %TEMP0.x
  %118 = bitcast float %117 to i32
  %119 = add i32 %118, 1
  %120 = bitcast i32 %119 to float
  store float %120, float* %TEMP0.x
  br label %loop1

endloop19:                                        ; preds = %if3
  ; Four identical groups of four interp.mov calls; the second operand selects
  ; 0..3 within each group. Only %121, %126, %131 and %136 are used below.
  %121 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %122 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %123 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %124 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %125 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %126 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %127 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %128 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %129 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %130 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %131 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %132 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %133 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %134 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %135 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %136 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  ; OUT0.x/y/z/w take the results whose second operand is 0/1/2/3.
  store float %121, float* %OUT0.x
  store float %126, float* %OUT0.y
  store float %131, float* %OUT0.z
  store float %136, float* %OUT0.w
  %137 = load float, float* %OUT0.x
  %138 = load float, float* %OUT0.y
  %139 = load float, float* %OUT0.z
  %140 = load float, float* %OUT0.w
  ; Build the return struct from undef: field 8 = bits of %4, fields 9-12 =
  ; OUT0.xyzw, field 23 = %20. All other fields stay undef.
  %141 = bitcast float %4 to i32
  %142 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %141, 8
  %143 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %142, float %137, 9
  %144 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %143, float %138, 10
  %145 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %144, float %139, 11
  %146 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %145, float %140, 12
  %147 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %146, float %20, 23
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %147
}

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #2

; Function Attrs: nounwind readnone speculatable
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #3

; Function Attrs: alwaysinline
; Pixel-shader epilog part. Parameters are unnamed, so they are numbered in
; order: %0-%3 are the four inreg i64 values, %4 is the inreg float, and
; %5-%19 are the fifteen plain float values.
; The body issues a single export: the first two i32 operands are 0 and 1,
; only %5 is supplied as data (the other three data operands are undef), and
; both trailing i1 flags are true. This matches the
; `exp mrt0 v0, off, off, off done vm` line in the disassembly dumped for the
; first shader in this file.
; All other parameters (%0-%4, %6-%19) are unused by this body.
define private amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #4 {
main_body:
  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %5, float undef, float undef, float undef, i1 true, i1 true) #5
  ret void
}

; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5

; Entry point that glues the shader parts together: it forwards its own
; arguments to @main, then feeds the fields of @main's 24-element return
; struct to @ps_epilog.
; The 22 parameters are unnamed and therefore numbered %0-%21; they have the
; same types as @main's parameters (the four pointers are `byval` here and
; `inreg` in @main).
; Every argument is first taken apart into 32-bit scalars and then rebuilt
; into its original type before the call, so the values passed to @main are
; bit-for-bit the incoming ones.
define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #6 {
main_body:
  ; --- Unpack: split each 64-bit pointer argument (%0-%3) into two i32 halves.
  %22 = ptrtoint [12 x <4 x i32>] addrspace(2)* %0 to i64
  %23 = bitcast i64 %22 to <2 x i32>
  %24 = extractelement <2 x i32> %23, i32 0
  %25 = extractelement <2 x i32> %23, i32 1
  %26 = ptrtoint [0 x <8 x i32>] addrspace(2)* %1 to i64
  %27 = bitcast i64 %26 to <2 x i32>
  %28 = extractelement <2 x i32> %27, i32 0
  %29 = extractelement <2 x i32> %27, i32 1
  %30 = ptrtoint [0 x <4 x i32>] addrspace(2)* %2 to i64
  %31 = bitcast i64 %30 to <2 x i32>
  %32 = extractelement <2 x i32> %31, i32 0
  %33 = extractelement <2 x i32> %31, i32 1
  %34 = ptrtoint [80 x <8 x i32>] addrspace(2)* %3 to i64
  %35 = bitcast i64 %34 to <2 x i32>
  %36 = extractelement <2 x i32> %35, i32 0
  %37 = extractelement <2 x i32> %35, i32 1
  ; --- Unpack: view the inreg float %4 as i32, and split the integer vector
  ; arguments %6-%12 into individual float-typed lanes.
  %38 = bitcast float %4 to i32
  %39 = bitcast <2 x i32> %6 to <2 x float>
  %40 = extractelement <2 x float> %39, i32 0
  %41 = extractelement <2 x float> %39, i32 1
  %42 = bitcast <2 x i32> %7 to <2 x float>
  %43 = extractelement <2 x float> %42, i32 0
  %44 = extractelement <2 x float> %42, i32 1
  %45 = bitcast <2 x i32> %8 to <2 x float>
  %46 = extractelement <2 x float> %45, i32 0
  %47 = extractelement <2 x float> %45, i32 1
  %48 = bitcast <3 x i32> %9 to <3 x float>
  %49 = extractelement <3 x float> %48, i32 0
  %50 = extractelement <3 x float> %48, i32 1
  %51 = extractelement <3 x float> %48, i32 2
  %52 = bitcast <2 x i32> %10 to <2 x float>
  %53 = extractelement <2 x float> %52, i32 0
  %54 = extractelement <2 x float> %52, i32 1
  %55 = bitcast <2 x i32> %11 to <2 x float>
  %56 = extractelement <2 x float> %55, i32 0
  %57 = extractelement <2 x float> %55, i32 1
  %58 = bitcast <2 x i32> %12 to <2 x float>
  %59 = extractelement <2 x float> %58, i32 0
  %60 = extractelement <2 x float> %58, i32 1
  ; The i32 arguments %18, %19 and %21 are viewed as float here and cast back
  ; to i32 below (%103-%105).
  %61 = bitcast i32 %18 to float
  %62 = bitcast i32 %19 to float
  %63 = bitcast i32 %21 to float
  ; --- Repack: rebuild the four pointers from their i32 halves.
  %64 = insertelement <2 x i32> undef, i32 %24, i32 0
  %65 = insertelement <2 x i32> %64, i32 %25, i32 1
  %66 = bitcast <2 x i32> %65 to i64
  %67 = inttoptr i64 %66 to [12 x <4 x i32>] addrspace(2)*
  %68 = insertelement <2 x i32> undef, i32 %28, i32 0
  %69 = insertelement <2 x i32> %68, i32 %29, i32 1
  %70 = bitcast <2 x i32> %69 to i64
  %71 = inttoptr i64 %70 to [0 x <8 x i32>] addrspace(2)*
  %72 = insertelement <2 x i32> undef, i32 %32, i32 0
  %73 = insertelement <2 x i32> %72, i32 %33, i32 1
  %74 = bitcast <2 x i32> %73 to i64
  %75 = inttoptr i64 %74 to [0 x <4 x i32>] addrspace(2)*
  %76 = insertelement <2 x i32> undef, i32 %36, i32 0
  %77 = insertelement <2 x i32> %76, i32 %37, i32 1
  %78 = bitcast <2 x i32> %77 to i64
  %79 = inttoptr i64 %78 to [80 x <8 x i32>] addrspace(2)*
  ; --- Repack: rebuild the float and the integer vectors in their original types.
  %80 = bitcast i32 %38 to float
  %81 = insertelement <2 x float> undef, float %40, i32 0
  %82 = insertelement <2 x float> %81, float %41, i32 1
  %83 = bitcast <2 x float> %82 to <2 x i32>
  %84 = insertelement <2 x float> undef, float %43, i32 0
  %85 = insertelement <2 x float> %84, float %44, i32 1
  %86 = bitcast <2 x float> %85 to <2 x i32>
  %87 = insertelement <2 x float> undef, float %46, i32 0
  %88 = insertelement <2 x float> %87, float %47, i32 1
  %89 = bitcast <2 x float> %88 to <2 x i32>
  %90 = insertelement <3 x float> undef, float %49, i32 0
  %91 = insertelement <3 x float> %90, float %50, i32 1
  %92 = insertelement <3 x float> %91, float %51, i32 2
  %93 = bitcast <3 x float> %92 to <3 x i32>
  %94 = insertelement <2 x float> undef, float %53, i32 0
  %95 = insertelement <2 x float> %94, float %54, i32 1
  %96 = bitcast <2 x float> %95 to <2 x i32>
  %97 = insertelement <2 x float> undef, float %56, i32 0
  %98 = insertelement <2 x float> %97, float %57, i32 1
  %99 = bitcast <2 x float> %98 to <2 x i32>
  %100 = insertelement <2 x float> undef, float %59, i32 0
  %101 = insertelement <2 x float> %100, float %60, i32 1
  %102 = bitcast <2 x float> %101 to <2 x i32>
  %103 = bitcast float %61 to i32
  %104 = bitcast float %62 to i32
  %105 = bitcast float %63 to i32
  ; --- Run the main shader part. %5, %13-%17 and %20 are passed through
  ; directly; everything else uses the rebuilt values from above.
  %106 = call <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([12 x <4 x i32>] addrspace(2)* %67, [0 x <8 x i32>] addrspace(2)* %71, [0 x <4 x i32>] addrspace(2)* %75, [80 x <8 x i32>] addrspace(2)* %79, float %80, i32 %5, <2 x i32> %83, <2 x i32> %86, <2 x i32> %89, <3 x i32> %93, <2 x i32> %96, <2 x i32> %99, <2 x i32> %102, float %13, float %14, float %15, float %16, float %17, i32 %103, i32 %104, float %20, i32 %105)
  ; --- Pull all 24 fields out of @main's return struct (9 x i32, 15 x float).
  ; Note that @main's insertvalue chain only writes fields 8, 9-12 and 23;
  ; the remaining fields originate from undef.
  %107 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 0
  %108 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 1
  %109 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 2
  %110 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 3
  %111 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 4
  %112 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 5
  %113 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 6
  %114 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 7
  %115 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 8
  %116 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 9
  %117 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 10
  %118 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 11
  %119 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 12
  %120 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 13
  %121 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 14
  %122 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 15
  %123 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 16
  %124 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 17
  %125 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 18
  %126 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 19
  %127 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 20
  %128 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 21
  %129 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 22
  %130 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 23
  ; --- Build the epilog arguments: fields 0-7 are paired into four i64
  ; values, field 8 is viewed as float, and fields 9-23 are passed as-is.
  %131 = insertelement <2 x i32> undef, i32 %107, i32 0
  %132 = insertelement <2 x i32> %131, i32 %108, i32 1
  %133 = bitcast <2 x i32> %132 to i64
  %134 = insertelement <2 x i32> undef, i32 %109, i32 0
  %135 = insertelement <2 x i32> %134, i32 %110, i32 1
  %136 = bitcast <2 x i32> %135 to i64
  %137 = insertelement <2 x i32> undef, i32 %111, i32 0
  %138 = insertelement <2 x i32> %137, i32 %112, i32 1
  %139 = bitcast <2 x i32> %138 to i64
  %140 = insertelement <2 x i32> undef, i32 %113, i32 0
  %141 = insertelement <2 x i32> %140, i32 %114, i32 1
  %142 = bitcast <2 x i32> %141 to i64
  %143 = bitcast i32 %115 to float
  ; Field 9 (%116) lands in @ps_epilog's parameter %5, the only value the
  ; epilog actually exports.
  call void @ps_epilog(i64 %133, i64 %136, i64 %139, i64 %142, float %143, float %116, float %117, float %118, float %119, float %120, float %121, float %122, float %123, float %124, float %125, float %126, float %127, float %128, float %129, float %130)
  ret void
}

attributes #0 = { alwaysinline "no-signed-zeros-fp-math"="true" }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind writeonly }
attributes #3 = { nounwind readnone speculatable }
attributes #4 = { alwaysinline "InitialPSInputAddr"="16777215" "no-signed-zeros-fp-math"="true" }
attributes #5 = { nounwind }
attributes #6 = { "no-signed-zeros-fp-math"="true" }
attributes #7 = { nounwind readnone }

!0 = !{}
radeonsi: Compiling shader 6
SHADER KEY
  part.ps.prolog.color_two_side = 0
  part.ps.prolog.flatshade_colors = 0
  part.ps.prolog.poly_stipple = 0
  part.ps.prolog.force_persp_sample_interp = 0
  part.ps.prolog.force_linear_sample_interp = 0
  part.ps.prolog.force_persp_center_interp = 0
  part.ps.prolog.force_linear_center_interp = 0
  part.ps.prolog.bc_optimize_for_persp = 0
  part.ps.prolog.bc_optimize_for_linear = 0
  part.ps.epilog.spi_shader_col_format = 0x1
  part.ps.epilog.color_is_int8 = 0x0
  part.ps.epilog.color_is_int10 = 0x0
  part.ps.epilog.last_cbuf = 0
  part.ps.epilog.alpha_func = 7
  part.ps.epilog.alpha_to_one = 0
  part.ps.epilog.poly_line_smoothing = 0
  part.ps.epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
; Disassembly of the compiled `wrapper` entry point (AMDGPU/GCN ISA; the hex
; words after each `;` are the instruction encodings printed by the tool).
; Register roles as used below:
;   s[20:27] = 8-dword descriptor loaded from s[6:7] + 0x1c0 (0x1c0 = 14 * 32),
;              used by both image_store instructions
;   s[12:19] = 8-dword descriptor loaded from s[6:7] + 0x1e0 (0x1e0 = 15 * 32),
;              used by both image_load instructions
;   s0       = loop counter, starts at 0, +1 per iteration
;   s1       = odd store coordinate, starts at 1, +2 per iteration
;   v0       = constant load coordinate 16
wrapper:
BB0_0:
	s_load_dwordx8 s[20:27], s[6:7], 0x1c0           ; C00E0503 000001C0
	s_load_dwordx8 s[12:19], s[6:7], 0x1e0           ; C00E0303 000001E0
	s_mov_b32 s0, 0                                  ; BE800080
	s_mov_b32 s1, 1                                  ; BE810081
	v_mov_b32_e32 v0, 16                             ; 7E000290
	s_waitcnt lgkmcnt(0)                             ; BF8C007F
; Clear bit 21 in dword 6 of the store descriptor; this is the
; `and i32 ..., -2097153` applied to element 6 in the IR (0xffdfffff == -2097153).
	s_and_b32 s26, s26, 0xffdfffff                   ; 861AFF1A FFDFFFFF
; Loop guard: skip the loop when s0 > 0x2ff (767), i.e. s0 >= 768.
	s_cmpk_gt_i32 s0, 0x2ff                          ; B20002FF
	s_cbranch_scc1 BB0_2                             ; BF850000
BB0_1:
; First half of the iteration: load 4 dwords at coordinate 16, add 1 to each,
; store them at coordinate s1 - 1 (the even slot).
	image_load v[1:4], v0, s[12:19] dmask:0xf unorm  ; F0001F00 00030100
	s_add_i32 s2, s1, -1                             ; 8102C101
	v_mov_b32_e32 v5, s2                             ; 7E0A0202
	s_add_i32 s0, s0, 1                              ; 81008100
	s_waitcnt vmcnt(0)                               ; BF8C0F70
	v_add_u32_e32 v1, vcc, 1, v1                     ; 32020281
	v_add_u32_e32 v2, vcc, 1, v2                     ; 32040481
	v_add_u32_e32 v3, vcc, 1, v3                     ; 32060681
	v_add_u32_e32 v4, vcc, 1, v4                     ; 32080881
	image_store v[1:4], v5, s[20:27] dmask:0xf unorm ; F0201F00 00050105
; NOTE(review): presumably this wait protects the store's source registers
; v[1:4]/v5 before the following load and v_mov overwrite them - confirm
; against the ISA manual.
	s_waitcnt expcnt(0)                              ; BF8C0F0F
; Second half: load the same location again, add -1 to each dword, store
; them at coordinate s1 (the odd slot), then advance s1 by 2.
	image_load v[1:4], v0, s[12:19] dmask:0xf unorm  ; F0001F00 00030100
	v_mov_b32_e32 v5, s1                             ; 7E0A0201
	s_add_i32 s1, s1, 2                              ; 81018201
	s_waitcnt vmcnt(0)                               ; BF8C0F70
	v_add_u32_e32 v1, vcc, -1, v1                    ; 320202C1
	v_add_u32_e32 v2, vcc, -1, v2                    ; 320404C1
	v_add_u32_e32 v3, vcc, -1, v3                    ; 320606C1
	v_add_u32_e32 v4, vcc, -1, v4                    ; 320808C1
	image_store v[1:4], v5, s[20:27] dmask:0xf unorm ; F0201F00 00050105
; NOTE(review): unlike after the first store, there is no s_waitcnt between
; this store and the image_load at the top of BB0_1 that reuses v[1:4] on the
; next iteration - worth checking whether that is intentional.
	s_cmpk_gt_i32 s0, 0x2ff                          ; B20002FF
	s_cbranch_scc0 BB0_1                             ; BF840000
BB0_2:
; After the loop: copy attr0.x into v0 and export it as the only enabled
; channel of mrt0, then end the program.
	s_mov_b32 m0, s9                                 ; BEFC0009
	v_interp_mov_f32 v0, p0, attr0.x                 ; D4020002
	exp mrt0 v0, off, off, off done vm               ; C4001801 00000000
	s_endpgm                                         ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0001
SPI_PS_INPUT_ENA  = 0x0001
*** SHADER STATS ***
SGPRS: 96
VGPRS: 8
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 172 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************


FRAG
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
DCL IMAGE[0], 1D, PIPE_FORMAT_R32_UINT, WR
DCL IMAGE[1], 1D, PIPE_FORMAT_R32_UINT, WR
DCL TEMP[0..11], LOCAL
IMM[0] INT32 {0, 768, 16, 2}
IMM[1] UINT32 {1, 4294967295, 0, 0}
IMM[2] INT32 {1, 0, 0, 0}
  0: MOV TEMP[0].x, IMM[0].xxxx
  1: BGNLOOP
  2:   ISGE TEMP[1].x, TEMP[0].xxxx, IMM[0].yyyy
  3:   UIF TEMP[1].xxxx
  4:     BRK
  5:   ENDIF
  6:   MOV TEMP[2].x, IMM[0].zzzz
  7:   LOAD TEMP[3], IMAGE[0], IMM[0].zzzz, RESTRICT, 1D, PIPE_FORMAT_R32_UINT
  8:   UMUL TEMP[4].x, IMM[0].wwww, TEMP[0].xxxx
  9:   MOV TEMP[5].x, TEMP[4].xxxx
 10:   UADD TEMP[6], TEMP[3], IMM[1].xxxx
 11:   STORE IMAGE[1], TEMP[4].xxxx, TEMP[6], RESTRICT, 1D, PIPE_FORMAT_R32_UINT
 12:   MOV TEMP[7].x, IMM[0].zzzz
 13:   LOAD TEMP[8], IMAGE[0], IMM[0].zzzz, RESTRICT, 1D, PIPE_FORMAT_R32_UINT
 14:   UMAD TEMP[9].x, IMM[0].wwww, TEMP[0].xxxx, IMM[2].xxxx
 15:   MOV TEMP[10].x, TEMP[9].xxxx
 16:   UADD TEMP[11], TEMP[8], IMM[1].yyyy
 17:   STORE IMAGE[1], TEMP[9].xxxx, TEMP[11], RESTRICT, 1D, PIPE_FORMAT_R32_UINT
 18:   UADD TEMP[0].x, TEMP[0].xxxx, IMM[2].xxxx
 19: ENDLOOP
 20: MOV OUT[0], IN[0]
 21: END
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Function Attrs: alwaysinline
; Main part of the pixel shader, a direct translation of the TGSI program
; listed just above this module: a 768-iteration loop that reads an image,
; writes value+1 to coordinate 2*i and value-1 to coordinate 2*i+1 of a
; second image, and finally copies input attribute 0 to OUT0.
; Parameters are unnamed and numbered %0-%21. The ones used by the body are:
;   %3  - pointer to [80 x <8 x i32>]; entries 14 and 15 are loaded as the
;         8-dword operands of the image store / image load intrinsics
;   %4  - inreg float, returned (as i32) in field 8 of the result struct
;   %5  - inreg i32, last operand of every llvm.amdgcn.interp.mov call
;   %20 - float, returned in field 23 of the result struct
; The return struct has 9 i32 fields followed by 15 float fields; only
; fields 8, 9-12 and 23 are written, the rest stay undef.
define private amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([12 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  ; One float stack slot per register channel: TEMP0..TEMP11 and OUT0, each
  ; with x/y/z/w. Integer values are kept in these slots as float bit
  ; patterns and moved in and out with bitcast.
  %TEMP11.w = alloca float
  %TEMP11.z = alloca float
  %TEMP11.y = alloca float
  %TEMP11.x = alloca float
  %TEMP10.w = alloca float
  %TEMP10.z = alloca float
  %TEMP10.y = alloca float
  %TEMP10.x = alloca float
  %TEMP9.w = alloca float
  %TEMP9.z = alloca float
  %TEMP9.y = alloca float
  %TEMP9.x = alloca float
  %TEMP8.w = alloca float
  %TEMP8.z = alloca float
  %TEMP8.y = alloca float
  %TEMP8.x = alloca float
  %TEMP7.w = alloca float
  %TEMP7.z = alloca float
  %TEMP7.y = alloca float
  %TEMP7.x = alloca float
  %TEMP6.w = alloca float
  %TEMP6.z = alloca float
  %TEMP6.y = alloca float
  %TEMP6.x = alloca float
  %TEMP5.w = alloca float
  %TEMP5.z = alloca float
  %TEMP5.y = alloca float
  %TEMP5.x = alloca float
  %TEMP4.w = alloca float
  %TEMP4.z = alloca float
  %TEMP4.y = alloca float
  %TEMP4.x = alloca float
  %TEMP3.w = alloca float
  %TEMP3.z = alloca float
  %TEMP3.y = alloca float
  %TEMP3.x = alloca float
  %TEMP2.w = alloca float
  %TEMP2.z = alloca float
  %TEMP2.y = alloca float
  %TEMP2.x = alloca float
  %TEMP1.w = alloca float
  %TEMP1.z = alloca float
  %TEMP1.y = alloca float
  %TEMP1.x = alloca float
  %TEMP0.w = alloca float
  %TEMP0.z = alloca float
  %TEMP0.y = alloca float
  %TEMP0.x = alloca float
  %OUT0.w = alloca float
  %OUT0.z = alloca float
  %OUT0.y = alloca float
  %OUT0.x = alloca float
  ; TGSI 0: loop counter i (TEMP0.x) = 0; the bit pattern of 0.0 is integer 0.
  store float 0.000000e+00, float* %TEMP0.x
  br label %loop1

loop1:                                            ; preds = %endif5, %main_body
  ; TGSI 2-5: TEMP1.x = (i >= 768) ? -1 : 0 (sext of the i1), and leave the
  ; loop when it is non-zero.
  %22 = load float, float* %TEMP0.x
  %23 = bitcast float %22 to i32
  %24 = icmp sge i32 %23, 768
  %25 = sext i1 %24 to i32
  %26 = bitcast i32 %25 to float
  store float %26, float* %TEMP1.x
  %27 = load float, float* %TEMP1.x
  %28 = bitcast float %27 to i32
  %29 = icmp ne i32 %28, 0
  br i1 %29, label %if3, label %endif5

if3:                                              ; preds = %loop1
  br label %endloop19

endif5:                                           ; preds = %loop1
  ; TGSI 6: TEMP2.x = 16. 0x36E0000000000000 is 2^-145, i.e. the
  ; single-precision denormal whose 32-bit pattern is integer 16.
  store float 0x36E0000000000000, float* %TEMP2.x
  ; TGSI 7: first image load. The 8-dword operand is entry 15 of %3, the
  ; coordinate is the constant 16 and the third operand is 15 (printed as
  ; dmask:0xf in the disassembly of the first shader in this file).
  ; NOTE(review): the call carries attribute group #7; in the first module
  ; dumped in this file #7 is { nounwind readnone } - confirm it is the same
  ; in this module's attribute list.
  %30 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 15, !amdgpu.uniform !0
  %31 = load <8 x i32>, <8 x i32> addrspace(2)* %30, !invariant.load !0
  %32 = call nsz <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 16, <8 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false) #7
  %33 = extractelement <4 x float> %32, i32 0
  %34 = extractelement <4 x float> %32, i32 1
  %35 = extractelement <4 x float> %32, i32 2
  %36 = extractelement <4 x float> %32, i32 3
  store float %33, float* %TEMP3.x
  store float %34, float* %TEMP3.y
  store float %35, float* %TEMP3.z
  store float %36, float* %TEMP3.w
  ; TGSI 8-9: TEMP4.x = 2 * i (even store coordinate); TEMP5.x = TEMP4.x.
  %37 = load float, float* %TEMP0.x
  %38 = bitcast float %37 to i32
  %39 = mul i32 2, %38
  %40 = bitcast i32 %39 to float
  store float %40, float* %TEMP4.x
  %41 = load float, float* %TEMP4.x
  store float %41, float* %TEMP5.x
  ; TGSI 10: TEMP6 = TEMP3 + 1, component-wise integer add.
  %42 = load float, float* %TEMP3.x
  %43 = bitcast float %42 to i32
  %44 = add i32 %43, 1
  %45 = load float, float* %TEMP3.y
  %46 = bitcast float %45 to i32
  %47 = add i32 %46, 1
  %48 = load float, float* %TEMP3.z
  %49 = bitcast float %48 to i32
  %50 = add i32 %49, 1
  %51 = load float, float* %TEMP3.w
  %52 = bitcast float %51 to i32
  %53 = add i32 %52, 1
  %54 = bitcast i32 %44 to float
  store float %54, float* %TEMP6.x
  %55 = bitcast i32 %47 to float
  store float %55, float* %TEMP6.y
  %56 = bitcast i32 %50 to float
  store float %56, float* %TEMP6.z
  %57 = bitcast i32 %53 to float
  store float %57, float* %TEMP6.w
  ; TGSI 11: first image store. Gather TEMP6 into a <4 x float>, take entry
  ; 14 of %3 as the 8-dword operand with bit 21 of element 6 cleared
  ; (-2097153 == ~0x200000), and store at coordinate TEMP4.x = 2 * i.
  %58 = load float, float* %TEMP6.x
  %59 = load float, float* %TEMP6.y
  %60 = load float, float* %TEMP6.z
  %61 = load float, float* %TEMP6.w
  %62 = insertelement <4 x float> undef, float %58, i32 0
  %63 = insertelement <4 x float> %62, float %59, i32 1
  %64 = insertelement <4 x float> %63, float %60, i32 2
  %65 = insertelement <4 x float> %64, float %61, i32 3
  %66 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 14, !amdgpu.uniform !0
  %67 = load <8 x i32>, <8 x i32> addrspace(2)* %66, !invariant.load !0
  %68 = extractelement <8 x i32> %67, i32 6
  %69 = and i32 %68, -2097153
  %70 = insertelement <8 x i32> %67, i32 %69, i32 6
  %71 = load float, float* %TEMP4.x
  %72 = bitcast float %71 to i32
  ; NOTE(review): this call site uses attribute group #8, whereas the
  ; equivalent store in the first module dumped in this file uses #2
  ; ({ nounwind writeonly }) - check what #8 expands to in this module.
  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %65, i32 %72, <8 x i32> %70, i32 15, i1 false, i1 false, i1 false, i1 false) #8
  ; TGSI 12: TEMP7.x = 16 (same bit-pattern constant as above).
  store float 0x36E0000000000000, float* %TEMP7.x
  ; TGSI 13: second image load, identical operands to the first one (entry
  ; 15, coordinate 16). It sits after the first image store in program order.
  %73 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 15, !amdgpu.uniform !0
  %74 = load <8 x i32>, <8 x i32> addrspace(2)* %73, !invariant.load !0
  %75 = call nsz <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 16, <8 x i32> %74, i32 15, i1 false, i1 false, i1 false, i1 false) #7
  %76 = extractelement <4 x float> %75, i32 0
  %77 = extractelement <4 x float> %75, i32 1
  %78 = extractelement <4 x float> %75, i32 2
  %79 = extractelement <4 x float> %75, i32 3
  store float %76, float* %TEMP8.x
  store float %77, float* %TEMP8.y
  store float %78, float* %TEMP8.z
  store float %79, float* %TEMP8.w
  ; TGSI 14-15: TEMP9.x = 2 * i + 1 (odd store coordinate); TEMP10.x = TEMP9.x.
  %80 = load float, float* %TEMP0.x
  %81 = bitcast float %80 to i32
  %82 = mul i32 2, %81
  %83 = add i32 %82, 1
  %84 = bitcast i32 %83 to float
  store float %84, float* %TEMP9.x
  %85 = load float, float* %TEMP9.x
  store float %85, float* %TEMP10.x
  ; TGSI 16: TEMP11 = TEMP8 + (-1), component-wise integer add
  ; (IMM[1].y is 4294967295, i.e. -1 as i32).
  %86 = load float, float* %TEMP8.x
  %87 = bitcast float %86 to i32
  %88 = add i32 %87, -1
  %89 = load float, float* %TEMP8.y
  %90 = bitcast float %89 to i32
  %91 = add i32 %90, -1
  %92 = load float, float* %TEMP8.z
  %93 = bitcast float %92 to i32
  %94 = add i32 %93, -1
  %95 = load float, float* %TEMP8.w
  %96 = bitcast float %95 to i32
  %97 = add i32 %96, -1
  %98 = bitcast i32 %88 to float
  store float %98, float* %TEMP11.x
  %99 = bitcast i32 %91 to float
  store float %99, float* %TEMP11.y
  %100 = bitcast i32 %94 to float
  store float %100, float* %TEMP11.z
  %101 = bitcast i32 %97 to float
  store float %101, float* %TEMP11.w
  ; TGSI 17: second image store, same 8-dword operand handling as the first
  ; (entry 14, bit 21 of element 6 cleared), at coordinate TEMP9.x = 2 * i + 1.
  %102 = load float, float* %TEMP11.x
  %103 = load float, float* %TEMP11.y
  %104 = load float, float* %TEMP11.z
  %105 = load float, float* %TEMP11.w
  %106 = insertelement <4 x float> undef, float %102, i32 0
  %107 = insertelement <4 x float> %106, float %103, i32 1
  %108 = insertelement <4 x float> %107, float %104, i32 2
  %109 = insertelement <4 x float> %108, float %105, i32 3
  %110 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i32 0, i32 14, !amdgpu.uniform !0
  %111 = load <8 x i32>, <8 x i32> addrspace(2)* %110, !invariant.load !0
  %112 = extractelement <8 x i32> %111, i32 6
  %113 = and i32 %112, -2097153
  %114 = insertelement <8 x i32> %111, i32 %113, i32 6
  %115 = load float, float* %TEMP9.x
  %116 = bitcast float %115 to i32
  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %109, i32 %116, <8 x i32> %114, i32 15, i1 false, i1 false, i1 false, i1 false) #8
  ; TGSI 18: i = i + 1, then back to the loop header.
  %117 = load float, float* %TEMP0.x
  %118 = bitcast float %117 to i32
  %119 = add i32 %118, 1
  %120 = bitcast i32 %119 to float
  store float %120, float* %TEMP0.x
  br label %loop1

endloop19:                                        ; preds = %if3
  ; TGSI 20: OUT0 = IN0. The same four interp.mov calls (second operand
  ; 0, 1, 2, 3) are emitted four times; only %121, %126, %131 and %136 - one
  ; per distinct second operand - are stored, the other twelve results are
  ; unused.
  %121 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %122 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %123 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %124 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %125 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %126 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %127 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %128 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %129 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %130 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %131 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %132 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  %133 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #7
  %134 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 1, i32 0, i32 %5) #7
  %135 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 2, i32 0, i32 %5) #7
  %136 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 3, i32 0, i32 %5) #7
  store float %121, float* %OUT0.x
  store float %126, float* %OUT0.y
  store float %131, float* %OUT0.z
  store float %136, float* %OUT0.w
  ; Build the return value: field 8 = %4 viewed as i32, fields 9-12 = OUT0
  ; x/y/z/w, field 23 = %20. All other fields remain undef.
  %137 = load float, float* %OUT0.x
  %138 = load float, float* %OUT0.y
  %139 = load float, float* %OUT0.z
  %140 = load float, float* %OUT0.w
  %141 = bitcast float %4 to i32
  %142 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %141, 8
  %143 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %142, float %137, 9
  %144 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %143, float %138, 10
  %145 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %144, float %139, 11
  %146 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %145, float %140, 12
  %147 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %146, float %20, 23
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %147
}

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #2

; Function Attrs: nounwind readnone speculatable
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #3

; Function Attrs: alwaysinline
; Pixel-shader epilog part of this module. Parameters are unnamed, so they
; are numbered in order: %0-%3 are the four inreg i64 values, %4 is the
; inreg float, and %5-%19 are the fifteen plain float values.
; The body issues a single export whose first two i32 operands are 0 and 1;
; only %5 is supplied as data (the other three data operands are undef) and
; both trailing i1 flags are true.
; All other parameters (%0-%4, %6-%19) are unused by this body.
define private amdgpu_ps void @ps_epilog(i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #4 {
main_body:
  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %5, float undef, float undef, float undef, i1 true, i1 true) #5
  ret void
}

; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5

; @wrapper: amdgpu_ps entry point that forwards its 22 arguments (%0..%21) to
; @main and then passes the 24 values returned by @main on to @ps_epilog.
;
; Every pointer/vector/bitcast argument is first decomposed into 32-bit scalars
; and then reassembled before the call, so @main receives the same bit patterns
; that @wrapper was given. NOTE(review): this round trip looks mechanically
; generated (one scalar per register?) -- confirm against the code that emits it.
define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #6 {
main_body:
  ; --- Decompose: split each of the four 64-bit pointer arguments (%0..%3)
  ; into two i32 halves (element 0 and element 1 of a <2 x i32>).
  %22 = ptrtoint [12 x <4 x i32>] addrspace(2)* %0 to i64
  %23 = bitcast i64 %22 to <2 x i32>
  %24 = extractelement <2 x i32> %23, i32 0
  %25 = extractelement <2 x i32> %23, i32 1
  %26 = ptrtoint [0 x <8 x i32>] addrspace(2)* %1 to i64
  %27 = bitcast i64 %26 to <2 x i32>
  %28 = extractelement <2 x i32> %27, i32 0
  %29 = extractelement <2 x i32> %27, i32 1
  %30 = ptrtoint [0 x <4 x i32>] addrspace(2)* %2 to i64
  %31 = bitcast i64 %30 to <2 x i32>
  %32 = extractelement <2 x i32> %31, i32 0
  %33 = extractelement <2 x i32> %31, i32 1
  %34 = ptrtoint [80 x <8 x i32>] addrspace(2)* %3 to i64
  %35 = bitcast i64 %34 to <2 x i32>
  %36 = extractelement <2 x i32> %35, i32 0
  %37 = extractelement <2 x i32> %35, i32 1
  ; Float argument %4 reinterpreted as i32 (undone again by %80 below).
  %38 = bitcast float %4 to i32
  ; Integer vector arguments %6..%12 reinterpreted as float vectors and split
  ; into their individual float elements.
  %39 = bitcast <2 x i32> %6 to <2 x float>
  %40 = extractelement <2 x float> %39, i32 0
  %41 = extractelement <2 x float> %39, i32 1
  %42 = bitcast <2 x i32> %7 to <2 x float>
  %43 = extractelement <2 x float> %42, i32 0
  %44 = extractelement <2 x float> %42, i32 1
  %45 = bitcast <2 x i32> %8 to <2 x float>
  %46 = extractelement <2 x float> %45, i32 0
  %47 = extractelement <2 x float> %45, i32 1
  %48 = bitcast <3 x i32> %9 to <3 x float>
  %49 = extractelement <3 x float> %48, i32 0
  %50 = extractelement <3 x float> %48, i32 1
  %51 = extractelement <3 x float> %48, i32 2
  %52 = bitcast <2 x i32> %10 to <2 x float>
  %53 = extractelement <2 x float> %52, i32 0
  %54 = extractelement <2 x float> %52, i32 1
  %55 = bitcast <2 x i32> %11 to <2 x float>
  %56 = extractelement <2 x float> %55, i32 0
  %57 = extractelement <2 x float> %55, i32 1
  %58 = bitcast <2 x i32> %12 to <2 x float>
  %59 = extractelement <2 x float> %58, i32 0
  %60 = extractelement <2 x float> %58, i32 1
  ; Integer scalar arguments %18, %19 and %21 reinterpreted as float
  ; (undone again by %103..%105 below).
  %61 = bitcast i32 %18 to float
  %62 = bitcast i32 %19 to float
  %63 = bitcast i32 %21 to float
  ; --- Reassemble: rebuild the four pointers from their i32 halves, with the
  ; same pointee types and address space as the incoming arguments.
  %64 = insertelement <2 x i32> undef, i32 %24, i32 0
  %65 = insertelement <2 x i32> %64, i32 %25, i32 1
  %66 = bitcast <2 x i32> %65 to i64
  %67 = inttoptr i64 %66 to [12 x <4 x i32>] addrspace(2)*
  %68 = insertelement <2 x i32> undef, i32 %28, i32 0
  %69 = insertelement <2 x i32> %68, i32 %29, i32 1
  %70 = bitcast <2 x i32> %69 to i64
  %71 = inttoptr i64 %70 to [0 x <8 x i32>] addrspace(2)*
  %72 = insertelement <2 x i32> undef, i32 %32, i32 0
  %73 = insertelement <2 x i32> %72, i32 %33, i32 1
  %74 = bitcast <2 x i32> %73 to i64
  %75 = inttoptr i64 %74 to [0 x <4 x i32>] addrspace(2)*
  %76 = insertelement <2 x i32> undef, i32 %36, i32 0
  %77 = insertelement <2 x i32> %76, i32 %37, i32 1
  %78 = bitcast <2 x i32> %77 to i64
  %79 = inttoptr i64 %78 to [80 x <8 x i32>] addrspace(2)*
  ; Inverse of %38: back to the original float value of %4.
  %80 = bitcast i32 %38 to float
  ; Rebuild the float vectors from their elements and cast them back to the
  ; integer vector types that @main expects.
  %81 = insertelement <2 x float> undef, float %40, i32 0
  %82 = insertelement <2 x float> %81, float %41, i32 1
  %83 = bitcast <2 x float> %82 to <2 x i32>
  %84 = insertelement <2 x float> undef, float %43, i32 0
  %85 = insertelement <2 x float> %84, float %44, i32 1
  %86 = bitcast <2 x float> %85 to <2 x i32>
  %87 = insertelement <2 x float> undef, float %46, i32 0
  %88 = insertelement <2 x float> %87, float %47, i32 1
  %89 = bitcast <2 x float> %88 to <2 x i32>
  %90 = insertelement <3 x float> undef, float %49, i32 0
  %91 = insertelement <3 x float> %90, float %50, i32 1
  %92 = insertelement <3 x float> %91, float %51, i32 2
  %93 = bitcast <3 x float> %92 to <3 x i32>
  %94 = insertelement <2 x float> undef, float %53, i32 0
  %95 = insertelement <2 x float> %94, float %54, i32 1
  %96 = bitcast <2 x float> %95 to <2 x i32>
  %97 = insertelement <2 x float> undef, float %56, i32 0
  %98 = insertelement <2 x float> %97, float %57, i32 1
  %99 = bitcast <2 x float> %98 to <2 x i32>
  %100 = insertelement <2 x float> undef, float %59, i32 0
  %101 = insertelement <2 x float> %100, float %60, i32 1
  %102 = bitcast <2 x float> %101 to <2 x i32>
  ; Inverse of %61..%63: back to the original i32 values of %18, %19 and %21.
  %103 = bitcast float %61 to i32
  %104 = bitcast float %62 to i32
  %105 = bitcast float %63 to i32
  ; Call the main shader part. Arguments %5, %13..%17 and %20 are forwarded
  ; untouched; all others are the reassembled values computed above.
  %106 = call <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([12 x <4 x i32>] addrspace(2)* %67, [0 x <8 x i32>] addrspace(2)* %71, [0 x <4 x i32>] addrspace(2)* %75, [80 x <8 x i32>] addrspace(2)* %79, float %80, i32 %5, <2 x i32> %83, <2 x i32> %86, <2 x i32> %89, <3 x i32> %93, <2 x i32> %96, <2 x i32> %99, <2 x i32> %102, float %13, float %14, float %15, float %16, float %17, i32 %103, i32 %104, float %20, i32 %105)
  ; Unpack all 24 fields of @main's packed-struct result:
  ; fields 0..8 are i32 (%107..%115), fields 9..23 are float (%116..%130).
  %107 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 0
  %108 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 1
  %109 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 2
  %110 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 3
  %111 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 4
  %112 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 5
  %113 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 6
  %114 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 7
  %115 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 8
  %116 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 9
  %117 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 10
  %118 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 11
  %119 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 12
  %120 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 13
  %121 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 14
  %122 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 15
  %123 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 16
  %124 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 17
  %125 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 18
  %126 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 19
  %127 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 20
  %128 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 21
  %129 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 22
  %130 = extractvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %106, 23
  ; Pack result fields 0..7 pairwise into four i64 values; within each pair the
  ; even-numbered field becomes vector element 0 and the odd one element 1.
  %131 = insertelement <2 x i32> undef, i32 %107, i32 0
  %132 = insertelement <2 x i32> %131, i32 %108, i32 1
  %133 = bitcast <2 x i32> %132 to i64
  %134 = insertelement <2 x i32> undef, i32 %109, i32 0
  %135 = insertelement <2 x i32> %134, i32 %110, i32 1
  %136 = bitcast <2 x i32> %135 to i64
  %137 = insertelement <2 x i32> undef, i32 %111, i32 0
  %138 = insertelement <2 x i32> %137, i32 %112, i32 1
  %139 = bitcast <2 x i32> %138 to i64
  %140 = insertelement <2 x i32> undef, i32 %113, i32 0
  %141 = insertelement <2 x i32> %140, i32 %114, i32 1
  %142 = bitcast <2 x i32> %141 to i64
  ; Result field 8 (i32) is reinterpreted as float for the epilog.
  %143 = bitcast i32 %115 to float
  ; Hand everything to the epilog part: the four packed i64 values, the
  ; reinterpreted field 8, then float fields 9..23 in order.
  call void @ps_epilog(i64 %133, i64 %136, i64 %139, i64 %142, float %143, float %116, float %117, float %118, float %119, float %120, float %121, float %122, float %123, float %124, float %125, float %126, float %127, float %128, float %129, float %130)
  ret void
}

; Attribute groups referenced as #N by the functions and declarations of this
; module. #0 is the group on @main and #6 is the group on @wrapper.
attributes #0 = { alwaysinline "no-signed-zeros-fp-math"="true" }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind writeonly }
attributes #3 = { nounwind readnone speculatable }
attributes #4 = { alwaysinline "InitialPSInputAddr"="16777215" "no-signed-zeros-fp-math"="true" }
attributes #5 = { nounwind }
attributes #6 = { "no-signed-zeros-fp-math"="true" }
attributes #7 = { nounwind readnone }
attributes #8 = { inaccessiblememonly nounwind }

; Empty metadata node.
!0 = !{}
radeonsi: Compiling shader 8
SHADER KEY
  part.ps.prolog.color_two_side = 0
  part.ps.prolog.flatshade_colors = 0
  part.ps.prolog.poly_stipple = 0
  part.ps.prolog.force_persp_sample_interp = 0
  part.ps.prolog.force_linear_sample_interp = 0
  part.ps.prolog.force_persp_center_interp = 0
  part.ps.prolog.force_linear_center_interp = 0
  part.ps.prolog.bc_optimize_for_persp = 0
  part.ps.prolog.bc_optimize_for_linear = 0
  part.ps.epilog.spi_shader_col_format = 0x1
  part.ps.epilog.color_is_int8 = 0x0
  part.ps.epilog.color_is_int10 = 0x0
  part.ps.epilog.last_cbuf = 0
  part.ps.epilog.alpha_func = 7
  part.ps.epilog.alpha_to_one = 0
  part.ps.epilog.poly_line_smoothing = 0
  part.ps.epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
; Disassembly of the compiled shader. The trailing "; XXXXXXXX" field on each
; instruction line is the encoding printed by the disassembler.
wrapper:
BB0_0:
	; Load two 8-dword blocks from the table addressed by s[6:7] (offsets 0x1c0
	; and 0x1e0). s[20:27] is later the resource operand of both image_store
	; instructions, s[12:19] the resource operand of the image_load.
	s_load_dwordx8 s[20:27], s[6:7], 0x1c0           ; C00E0503 000001C0
	s_load_dwordx8 s[12:19], s[6:7], 0x1e0           ; C00E0303 000001E0
	; Loop state: s0 = iteration counter (starts at 0), s1 = second store
	; address (starts at 1). v0 = 16 is the fixed address used by the load.
	s_mov_b32 s0, 0                                  ; BE800080
	s_mov_b32 s1, 1                                  ; BE810081
	v_mov_b32_e32 v0, 16                             ; 7E000290
	; Wait for the scalar loads above before touching s26.
	s_waitcnt lgkmcnt(0)                             ; BF8C007F
	; Clear bit 21 of s26 in the block used by the stores.
	; NOTE(review): the meaning of that bit is not visible here -- confirm.
	s_and_b32 s26, s26, 0xffdfffff                   ; 861AFF1A FFDFFFFF
	; Skip the loop if s0 > 0x2ff.
	s_cmpk_gt_i32 s0, 0x2ff                          ; B20002FF
	s_cbranch_scc1 BB0_2                             ; BF850000
BB0_1:
	; Loop body, repeated while s0 <= 0x2ff. The image_load is issued anew on
	; every iteration, i.e. after the previous iteration's two stores.
	image_load v[1:4], v0, s[12:19] dmask:0xf unorm  ; F0001F00 00030100
	; v9 = s1 - 1 is the address of the first store; s0 is incremented.
	s_add_i32 s2, s1, -1                             ; 8102C101
	v_mov_b32_e32 v9, s2                             ; 7E120202
	s_add_i32 s0, s0, 1                              ; 81008100
	; Wait for the loaded v[1:4] before using them.
	s_waitcnt vmcnt(0)                               ; BF8C0F70
	; v[5:8] = loaded value + 1 (per component), stored at address v9.
	v_add_u32_e32 v5, vcc, 1, v1                     ; 320A0281
	v_add_u32_e32 v6, vcc, 1, v2                     ; 320C0481
	v_add_u32_e32 v7, vcc, 1, v3                     ; 320E0681
	v_add_u32_e32 v8, vcc, 1, v4                     ; 32100881
	image_store v[5:8], v9, s[20:27] dmask:0xf unorm ; F0201F00 00050509
	; NOTE(review): presumably needed because v5, a data source of the store
	; above, is overwritten by the next instruction -- confirm.
	s_waitcnt expcnt(0)                              ; BF8C0F0F
	; v[1:4] = loaded value - 1 (per component), stored at address v5 = s1.
	v_mov_b32_e32 v5, s1                             ; 7E0A0201
	v_add_u32_e32 v1, vcc, -1, v1                    ; 320202C1
	v_add_u32_e32 v2, vcc, -1, v2                    ; 320404C1
	v_add_u32_e32 v3, vcc, -1, v3                    ; 320606C1
	v_add_u32_e32 v4, vcc, -1, v4                    ; 320808C1
	image_store v[1:4], v5, s[20:27] dmask:0xf unorm ; F0201F00 00050105
	; Advance the store address by 2 and loop back unless s0 > 0x2ff.
	s_add_i32 s1, s1, 2                              ; 81018201
	s_cmpk_gt_i32 s0, 0x2ff                          ; B20002FF
	s_cbranch_scc0 BB0_1                             ; BF840000
BB0_2:
	; After the loop: set m0 from s9, read attr0.x into v0, export v0 to mrt0
	; (other three channels off) and end the program.
	s_mov_b32 m0, s9                                 ; BEFC0009
	v_interp_mov_f32 v0, p0, attr0.x                 ; D4020002
	exp mrt0 v0, off, off, off done vm               ; C4001801 00000000
	s_endpgm                                         ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0001
SPI_PS_INPUT_ENA  = 0x0001
*** SHADER STATS ***
SGPRS: 96
VGPRS: 12
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 160 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************




More information about the llvm-commits mailing list