[llvm] r317754 - AMDGPU: Lower buffer store and atomic intrinsics manually

Thu Nov 9 07:49:52 PST 2017

Hi Marek,

On 09/11/17 02:52 AM, Marek Olsak via llvm-commits wrote:
> Author: mareko
> Date: Wed Nov  8 17:52:48 2017
> New Revision: 317754
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=317754&view=rev
> Log:
> AMDGPU: Lower buffer store and atomic intrinsics manually

This change broke piglit
spec@arb_tessellation_shader@execution@tes-input@tes-input-gl_clipdistance
on my Tonga:

PIGLIT TEST: 61 - tes-input-gl_ClipDistance
Probe color at (62,62)
  Expected: 0.000000 1.000000 0.000000
  Observed: 1.000000 0.000000 0.000000
Test failure on line 184
Probe color at (187,62)
  Expected: 0.000000 1.000000 0.000000
  Observed: 1.000000 0.000000 0.000000
Test failure on line 185
Probe color at (62,187)
  Expected: 0.000000 1.000000 0.000000
  Observed: 1.000000 0.000000 0.000000
Test failure on line 186
Probe color at (187,187)
  Expected: 0.000000 1.000000 0.000000
  Observed: 1.000000 0.000000 0.000000
Test failure on line 187

Note that it doesn't fail every time, but most of the time. Before this
change, it always passed.

I'm attaching a dump of the affected shader from before and after this
change. Let me know if you need more information.

-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
-------------- next part --------------
TESS_CTRL
PROPERTY TCS_VERTICES_OUT 3
DCL SV[0], INVOCATIONID
DCL SV[1], PRIM_ID
DCL OUT[][0..1], ARRAY(1), CLIPDIST
DCL OUT[2], TESSOUTER
DCL OUT[3].xy, TESSINNER
DCL CONST[0][0..95]
DCL TEMP[0..23], LOCAL
IMM[0] INT32 {3, 8, 0, 0}
IMM[1] FLT32 {    1.0000,     0.0000,     0.0000,     0.0000}
  0: MOV TEMP[0].yzw, OUT[SV[0].x](1)[0].zyzw
  1: UMAD TEMP[1].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
  2: UMUL TEMP[2].x, TEMP[1].xxxx, IMM[0].yyyy
  3: MOV TEMP[0].x, CONST[0][TEMP[2].x].xxxx
  4: MOV OUT[SV[0].x](1)[0], TEMP[0]
  5: MOV TEMP[3].xzw, OUT[SV[0].x](1)[0].xxzw
  6: UMAD TEMP[4].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
  7: UMUL TEMP[5].x, TEMP[4].xxxx, IMM[0].yyyy
  8: MOV TEMP[3].y, CONST[0][TEMP[5].x+1].yxyy
  9: MOV OUT[SV[0].x](1)[0], TEMP[3]
 10: MOV TEMP[6].xyw, OUT[SV[0].x](1)[0].xyxw
 11: UMAD TEMP[7].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 12: UMUL TEMP[8].x, TEMP[7].xxxx, IMM[0].yyyy
 13: MOV TEMP[6].z, CONST[0][TEMP[8].x+2].zzxz
 14: MOV OUT[SV[0].x](1)[0], TEMP[6]
 15: MOV TEMP[9].xyz, OUT[SV[0].x](1)[0].xyzx
 16: UMAD TEMP[10].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 17: UMUL TEMP[11].x, TEMP[10].xxxx, IMM[0].yyyy
 18: MOV TEMP[9].w, CONST[0][TEMP[11].x+3].wwwx
 19: MOV OUT[SV[0].x](1)[0], TEMP[9]
 20: MOV TEMP[12].yzw, OUT[SV[0].x](1)[1].zyzw
 21: UMAD TEMP[13].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 22: UMUL TEMP[14].x, TEMP[13].xxxx, IMM[0].yyyy
 23: MOV TEMP[12].x, CONST[0][TEMP[14].x+4].xxxx
 24: MOV OUT[SV[0].x](1)[1], TEMP[12]
 25: MOV TEMP[15].xzw, OUT[SV[0].x](1)[1].xxzw
 26: UMAD TEMP[16].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 27: UMUL TEMP[17].x, TEMP[16].xxxx, IMM[0].yyyy
 28: MOV TEMP[15].y, CONST[0][TEMP[17].x+5].yxyy
 29: MOV OUT[SV[0].x](1)[1], TEMP[15]
 30: MOV TEMP[18].xyw, OUT[SV[0].x](1)[1].xyxw
 31: UMAD TEMP[19].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 32: UMUL TEMP[20].x, TEMP[19].xxxx, IMM[0].yyyy
 33: MOV TEMP[18].z, CONST[0][TEMP[20].x+6].zzxz
 34: MOV OUT[SV[0].x](1)[1], TEMP[18]
 35: MOV TEMP[21].xyz, OUT[SV[0].x](1)[1].xyzx
 36: UMAD TEMP[22].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 37: UMUL TEMP[23].x, TEMP[22].xxxx, IMM[0].yyyy
 38: MOV TEMP[21].w, CONST[0][TEMP[23].x+7].wwwx
 39: MOV OUT[SV[0].x](1)[1], TEMP[21]
 40: MOV OUT[2].x, IMM[1].xxxx
 41: MOV OUT[2].y, IMM[1].xxxx
 42: MOV OUT[2].z, IMM[1].xxxx
 43: MOV OUT[2].w, IMM[1].xxxx
 44: MOV OUT[3].x, IMM[1].xxxx
 45: MOV OUT[3].y, IMM[1].xxxx
 46: END
radeonsi: Compiling shader 2
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #1

; Compiled tessellation control shader entry point (amdgpu_hs calling
; convention). Arguments %0-%3 are addrspace(2) pointers, %4-%11 are `inreg`
; i32 values and %12/%13 are plain i32 values.
;
; Shape of the body:
;   1. Two groups of four steps. Each step loads one float with
;      llvm.SI.load.const, stores it to one addrspace(3) slot and then issues
;      a v4f32 buffer store of the vector built so far. All four buffer stores
;      of a group target the same offset (%60 for the first group, %105 for
;      the second).
;   2. An epilog, executed only when %17 == 0, that buffer-stores constant
;      1.0 values (and -0.0 when %20 == 0) through a second descriptor.
; NOTE(review): this presumably corresponds to the eight CLIPDIST component
; writes and the TESSOUTER/TESSINNER writes of the TGSI listing in the same
; dump; the mapping is inferred, not stated by the IR.
define amdgpu_hs void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x float] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32, i32) #2 {
main_body:
  ; Split pointer argument %2 into two dwords; they become elements 0 and 1
  ; of the load.const resource vector %41 below.
  %14 = ptrtoint [0 x float] addrspace(2)* %2 to i64
  %15 = bitcast i64 %14 to <2 x i32>
  ; %17 = bits 8..12 of %13, %20 = bits 0..7 of %13.
  ; NOTE(review): %17 looks like the invocation id (SV[0]) - confirm.
  %16 = lshr i32 %13, 8
  %17 = and i32 %16, 31
  ; %24 = ((%5 << 2) & 262140) + %20 * 608 + %17 * 200: base index, in i32
  ; elements, for every addrspace(3) access in this function.
  %18 = shl i32 %5, 2
  %19 = and i32 %18, 262140
  %20 = and i32 %13, 255
  %21 = mul nuw nsw i32 %20, 608
  %22 = add nuw nsw i32 %19, %21
  %23 = mul nuw nsw i32 %17, 200
  %24 = add nuw nsw i32 %22, %23
  ; First group uses slots %24+192 .. %24+195. Slots 193..195 are read here,
  ; before anything is written; slot 192 (%25) is only written (see %54).
  %25 = add nuw nsw i32 %24, 192
  %26 = add nuw nsw i32 %24, 193
  %27 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %26
  %28 = load i32, i32 addrspace(3)* %27, align 4, !noalias !0
  %29 = add nuw nsw i32 %24, 194
  %30 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %29
  %31 = load i32, i32 addrspace(3)* %30, align 8, !noalias !0
  %32 = add nuw nsw i32 %24, 195
  %33 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %32
  %34 = load i32, i32 addrspace(3)* %33, align 4, !noalias !0
  ; %37 = (%12 * 3 + %17) << 7: offset operand of the first load.const. The
  ; following seven loads OR in 16, 32, ... 112.
  %35 = mul i32 %12, 3
  %36 = add i32 %35, %17
  %37 = shl i32 %36, 7
  ; Assemble %41 = <%15[0], %15[1], 1536, 163756>, the resource operand shared
  ; by all eight llvm.SI.load.const calls.
  %38 = shufflevector <2 x i32> %15, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %39 = shufflevector <2 x i32> %15, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
  %40 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 1536, i32 163756>, <4 x i32> %38, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %41 = shufflevector <4 x i32> %40, <4 x i32> %39, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %42 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %37)
  ; Assemble %46, the resource operand of the eight v4f32 buffer stores:
  ; low 64 bits = zext(%8) << 16, high 64 bits = 703330959491071
  ; (0x00027FAC_FFFFFFFF).
  %43 = zext i32 %8 to i64
  %44 = shl nuw nsw i64 %43, 16
  %45 = insertelement <2 x i64> <i64 undef, i64 703330959491071>, i64 %44, i32 0
  %46 = bitcast <2 x i64> %45 to <4 x i32>
  ; %52 = (%20 * 3 + %17 + (%4 & 63) * 144) << 4; %60 = %52 + %10 is the
  ; offset operand of the first group's buffer stores.
  %47 = and i32 %4, 63
  %48 = mul nuw nsw i32 %20, 3
  %49 = add nuw nsw i32 %48, %17
  %50 = mul nuw nsw i32 %47, 144
  %51 = add nuw nsw i32 %49, %50
  %52 = shl nuw nsw i32 %51, 4
  ; Step 1 of group 1: store loaded value %42 to slot 192, then buffer-store
  ; <%42, old slot 193, old slot 194, old slot 195>.
  %53 = bitcast float %42 to i32
  %54 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %25
  %55 = bitcast i32 addrspace(3)* %54 to float addrspace(3)*
  store float %42, float addrspace(3)* %55, align 16, !noalias !0
  %56 = insertelement <4 x i32> undef, i32 %53, i32 0
  %57 = insertelement <4 x i32> %56, i32 %28, i32 1
  %58 = insertelement <4 x i32> %57, i32 %31, i32 2
  %59 = insertelement <4 x i32> %58, i32 %34, i32 3
  %60 = add i32 %52, %10
  %61 = bitcast <4 x i32> %59 to <4 x float>
  ; NOTE(review): the trailing i1 operands are presumably glc=true, slc=false
  ; (the disassembly in this dump shows `glc` on the stores) - confirm against
  ; the intrinsic definition.
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %61, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
  ; Step 2 of group 1: element 1 is replaced by %63. The slot-192 store from
  ; step 1 is repeated verbatim before the new slot-193 store.
  %62 = or i32 %37, 16
  %63 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %62)
  store float %42, float addrspace(3)* %55, align 16, !noalias !0
  %64 = bitcast float %63 to i32
  %65 = bitcast i32 addrspace(3)* %27 to float addrspace(3)*
  store float %63, float addrspace(3)* %65, align 4, !noalias !0
  %66 = insertelement <4 x i32> %56, i32 %64, i32 1
  %67 = insertelement <4 x i32> %66, i32 %31, i32 2
  %68 = insertelement <4 x i32> %67, i32 %34, i32 3
  %69 = bitcast <4 x i32> %68 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %69, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
  ; Step 3 of group 1: element 2 is replaced by %71; the slot-192 and slot-193
  ; stores are repeated first.
  %70 = or i32 %37, 32
  %71 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %70)
  store float %42, float addrspace(3)* %55, align 16, !noalias !0
  store float %63, float addrspace(3)* %65, align 4, !noalias !0
  %72 = bitcast float %71 to i32
  %73 = bitcast i32 addrspace(3)* %30 to float addrspace(3)*
  store float %71, float addrspace(3)* %73, align 8, !noalias !0
  %74 = insertelement <4 x i32> %66, i32 %72, i32 2
  %75 = insertelement <4 x i32> %74, i32 %34, i32 3
  %76 = bitcast <4 x i32> %75 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %76, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
  ; Step 4 of group 1: element 3 is replaced by %78, so the stored vector is
  ; now <%42, %63, %71, %78>; the three earlier slot stores are repeated first.
  %77 = or i32 %37, 48
  %78 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %77)
  store float %42, float addrspace(3)* %55, align 16, !noalias !0
  store float %63, float addrspace(3)* %65, align 4, !noalias !0
  store float %71, float addrspace(3)* %73, align 8, !noalias !0
  %79 = bitcast float %78 to i32
  %80 = bitcast i32 addrspace(3)* %33 to float addrspace(3)*
  store float %78, float addrspace(3)* %80, align 4, !noalias !0
  %81 = insertelement <4 x i32> %74, i32 %79, i32 3
  %82 = bitcast <4 x i32> %81 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %82, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
  ; Second group: same pattern on slots %24+196 .. %24+199. Slots 197..199 are
  ; read up front; slot 196 (%83) is only written.
  %83 = add nuw nsw i32 %24, 196
  %84 = add nuw nsw i32 %24, 197
  %85 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %84
  %86 = load i32, i32 addrspace(3)* %85, align 4, !noalias !0
  %87 = add nuw nsw i32 %24, 198
  %88 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %87
  %89 = load i32, i32 addrspace(3)* %88, align 8, !noalias !0
  %90 = add nuw nsw i32 %24, 199
  %91 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %90
  %92 = load i32, i32 addrspace(3)* %91, align 4, !noalias !0
  %93 = or i32 %37, 64
  %94 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %93)
  ; %97 = (%49 + (%4 & 63) * 147) << 4; %105 = %97 + %10 is the offset operand
  ; of the second group's buffer stores (the multiplier is 147 here, 144 in
  ; the first group).
  %95 = mul nuw nsw i32 %47, 147
  %96 = add nuw nsw i32 %49, %95
  %97 = shl nuw nsw i32 %96, 4
  ; Step 1 of group 2: store %94 to slot 196, then buffer-store
  ; <%94, old slot 197, old slot 198, old slot 199>.
  %98 = bitcast float %94 to i32
  %99 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %83
  %100 = bitcast i32 addrspace(3)* %99 to float addrspace(3)*
  store float %94, float addrspace(3)* %100, align 16, !noalias !0
  %101 = insertelement <4 x i32> undef, i32 %98, i32 0
  %102 = insertelement <4 x i32> %101, i32 %86, i32 1
  %103 = insertelement <4 x i32> %102, i32 %89, i32 2
  %104 = insertelement <4 x i32> %103, i32 %92, i32 3
  %105 = add i32 %97, %10
  %106 = bitcast <4 x i32> %104 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %106, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
  ; Step 2 of group 2: element 1 is replaced by %108.
  %107 = or i32 %37, 80
  %108 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %107)
  store float %94, float addrspace(3)* %100, align 16, !noalias !0
  %109 = bitcast float %108 to i32
  %110 = bitcast i32 addrspace(3)* %85 to float addrspace(3)*
  store float %108, float addrspace(3)* %110, align 4, !noalias !0
  %111 = insertelement <4 x i32> %101, i32 %109, i32 1
  %112 = insertelement <4 x i32> %111, i32 %89, i32 2
  %113 = insertelement <4 x i32> %112, i32 %92, i32 3
  %114 = bitcast <4 x i32> %113 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %114, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
  ; Step 3 of group 2: element 2 is replaced by %116.
  %115 = or i32 %37, 96
  %116 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %115)
  store float %94, float addrspace(3)* %100, align 16, !noalias !0
  store float %108, float addrspace(3)* %110, align 4, !noalias !0
  %117 = bitcast float %116 to i32
  %118 = bitcast i32 addrspace(3)* %88 to float addrspace(3)*
  store float %116, float addrspace(3)* %118, align 8, !noalias !0
  %119 = insertelement <4 x i32> %111, i32 %117, i32 2
  %120 = insertelement <4 x i32> %119, i32 %92, i32 3
  %121 = bitcast <4 x i32> %120 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %121, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
  ; Step 4 of group 2: element 3 is replaced by %123, so the stored vector is
  ; now <%94, %108, %116, %123>.
  %122 = or i32 %37, 112
  %123 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %122)
  store float %94, float addrspace(3)* %100, align 16, !noalias !0
  store float %108, float addrspace(3)* %110, align 4, !noalias !0
  store float %116, float addrspace(3)* %118, align 8, !noalias !0
  %124 = bitcast float %123 to i32
  %125 = bitcast i32 addrspace(3)* %91 to float addrspace(3)*
  store float %123, float addrspace(3)* %125, align 4, !noalias !0
  %126 = insertelement <4 x i32> %119, i32 %124, i32 3
  %127 = bitcast <4 x i32> %126 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %127, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
  ; Only the lanes with %17 == 0 run the epilog below; all others return.
  %128 = icmp eq i32 %17, 0
  br i1 %128, label %if-true-block.i, label %tcs_epilog.exit

if-true-block.i:                                  ; preds = %main_body
  ; Assemble %132, the resource operand of the epilog stores: same layout as
  ; %46 but with zext(%9) << 16 in the low 64 bits.
  %129 = zext i32 %9 to i64
  %130 = shl nuw nsw i64 %129, 16
  %131 = insertelement <2 x i64> <i64 undef, i64 703330959491071>, i64 %130, i32 0
  %132 = bitcast <2 x i64> %131 to <4 x i32>
  ; %133 = %20 * 24 is added to the offsets of the two constant stores below.
  %133 = mul nuw nsw i32 %20, 24
  %134 = icmp eq i32 %20, 0
  br i1 %134, label %if-true-block2.i, label %endif-block1.i

if-true-block2.i:                                 ; preds = %if-true-block.i
  ; Only when %20 == 0: store a single float -0.0 at offset %11.
  call void @llvm.amdgcn.buffer.store.f32(float -0.000000e+00, <4 x i32> %132, i32 0, i32 %11, i1 true, i1 false) #3
  br label %endif-block1.i

endif-block1.i:                                   ; preds = %if-true-block2.i, %if-true-block.i
  ; Store four floats of 1.0 at offset %11 + 4 + %133 and two floats of 1.0 at
  ; offset %11 + 20 + %133.
  ; NOTE(review): presumably the TESSOUTER (xyzw) and TESSINNER (xy) outputs
  ; that the TGSI listing sets to 1.0 - confirm.
  %135 = add i32 %11, 4
  %136 = add i32 %135, %133
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x i32> %132, i32 0, i32 %136, i1 true, i1 false) #3
  %137 = add i32 %11, 20
  %138 = add i32 %137, %133
  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <4 x i32> %132, i32 0, i32 %138, i1 true, i1 false) #3
  br label %tcs_epilog.exit

tcs_epilog.exit:                                  ; preds = %main_body, %endif-block1.i
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind writeonly }
attributes #2 = { "amdgpu-max-work-group-size"="128" "no-signed-zeros-fp-math"="true" }
attributes #3 = { inaccessiblememonly nounwind }

!0 = !{!1}
!1 = distinct !{!1, !2, !"main: argument 0"}
!2 = distinct !{!2, !"main"}

SHADER KEY
  part.tcs.epilog.prim_mode = 7
  mono.u.ff_tcs_inputs_to_copy = 0x0

Tessellation Control Shader:
Shader main disassembly:
	v_mul_lo_i32 v0, v0, 3                                         ; D2850000 00010700
	v_bfe_u32 v17, v1, 8, 5                                        ; D1C80011 02151101
	s_lshl_b32 s0, s9, 2                                           ; 8E008209
	s_and_b32 s0, s0, 0x3fffc                                      ; 8600FF00 0003FFFC
	v_add_i32_e32 v0, vcc, v0, v17                                 ; 32002300
	v_lshlrev_b32_e32 v18, 7, v0                                   ; 24240087
	v_and_b32_e32 v0, 0xff, v1                                     ; 260002FF 000000FF
	v_mov_b32_e32 v1, 0x260                                        ; 7E0202FF 00000260
	s_mov_b32 s3, 0x27fac                                          ; BE8300FF 00027FAC
	v_mad_u32_u24 v1, v0, v1, s0                                   ; D1C30001 00020300
	v_mov_b32_e32 v3, 0xc8                                         ; 7E0602FF 000000C8
	v_mad_u32_u24 v1, v17, v3, v1                                  ; D1C30001 04060711
	s_movk_i32 s6, 0x600                                           ; B0060600
	s_mov_b32 s7, s3                                               ; BE870003
	v_lshlrev_b32_e32 v19, 2, v1                                   ; 24260282
	buffer_load_dword v2, v18, s[4:7], 0 offen                     ; E0501000 80010212
	s_and_b32 s8, s8, 63                                           ; 8608BF08
	v_mad_u32_u24 v1, v0, 3, v17                                   ; D1C30001 04450700
	v_mov_b32_e32 v5, 0x90                                         ; 7E0A02FF 00000090
	v_mad_u32_u24 v1, s8, v5, v1                                   ; D1C30001 04060A08
	v_or_b32_e32 v5, 16, v18                                       ; 280A2490
	buffer_load_dword v14, v5, s[4:7], 0 offen                     ; E0501000 80010E05
	s_mov_b32 m0, -1                                               ; BEFC00C1
	ds_read2_b32 v[3:4], v19 offset0:193 offset1:194               ; D86EC2C1 03000013
	v_or_b32_e32 v15, 32, v18                                      ; 281E24A0
	s_mov_b32 s11, 0                                               ; BE8B0080
	s_mov_b32 s10, s13                                             ; BE8A000D
	s_mov_b32 s13, s11                                             ; BE8D000B
	s_lshl_b64 s[0:1], s[12:13], 16                                ; 8E80900C
	s_mov_b32 s2, -1                                               ; BE8200C1
	v_or_b32_e32 v21, 0x50, v18                                    ; 282A24FF 00000050
	s_waitcnt vmcnt(1) lgkmcnt(0)                                  ; BF8C0071
	v_mov_b32_e32 v9, v5                                           ; 7E120305
	v_mov_b32_e32 v7, v3                                           ; 7E0E0303
	v_mov_b32_e32 v8, v4                                           ; 7E100304
	v_mov_b32_e32 v6, v2                                           ; 7E0C0302
	v_lshlrev_b32_e32 v5, 4, v1                                    ; 240A0284
	v_add_i32_e32 v20, vcc, s14, v5                                ; 32280A0E
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	v_mov_b32_e32 v7, v14                                          ; 7E0E030E
	v_mov_b32_e32 v13, v9                                          ; 7E1A0309
	v_mov_b32_e32 v12, v8                                          ; 7E180308
	v_mov_b32_e32 v11, v7                                          ; 7E160307
	v_mov_b32_e32 v10, v6                                          ; 7E140306
	buffer_load_dword v8, v15, s[4:7], 0 offen                     ; E0501000 8001080F
	v_mov_b32_e32 v13, v2                                          ; 7E1A0302
	v_or_b32_e32 v5, 48, v18                                       ; 280A24B0
	v_mad_u32_u24 v1, s8, 3, v1                                    ; D1C30001 04050608
	v_lshlrev_b32_e32 v1, 4, v1                                    ; 24020284
	v_mov_b32_e32 v12, v4                                          ; 7E180304
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	ds_write2_b32 v19, v2, v8 offset0:192 offset1:194              ; D81CC2C0 00080213
	ds_write_b64 v19, v[13:14] offset:768                          ; D89A0300 00000D13
	ds_write_b64 v19, v[13:14] offset:768                          ; D89A0300 00000D13
	ds_write_b64 v19, v[13:14] offset:768                          ; D89A0300 00000D13
	buffer_load_dword v14, v5, s[4:7], 0 offen                     ; E0501000 80010E05
	ds_read2_b32 v[15:16], v19 offset0:195 offset1:199             ; D86EC7C3 0F000013
	s_waitcnt lgkmcnt(0)                                           ; BF8C007F
	v_mov_b32_e32 v5, v15                                          ; 7E0A030F
	v_mov_b32_e32 v13, v15                                         ; 7E1A030F
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	v_mov_b32_e32 v9, v14                                          ; 7E12030E
	buffer_store_dwordx4 v[6:9], v20, s[0:3], 0 offen glc          ; E07C5000 80000614
	buffer_store_dwordx4 v[2:5], v20, s[0:3], 0 offen glc          ; E07C5000 80000214
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	ds_read2_b32 v[2:3], v19 offset0:197 offset1:198               ; D86EC6C5 02000013
	v_add_i32_e32 v5, vcc, s14, v1                                 ; 320A020E
	v_or_b32_e32 v1, 64, v18                                       ; 280224C0
	buffer_load_dword v1, v1, s[4:7], 0 offen                      ; E0501000 80010101
	v_mov_b32_e32 v9, v15                                          ; 7E12030F
	v_mov_b32_e32 v4, v16                                          ; 7E080310
	buffer_store_dwordx4 v[10:13], v20, s[0:3], 0 offen glc        ; E07C5000 80000A14
	buffer_store_dwordx4 v[6:9], v20, s[0:3], 0 offen glc          ; E07C5000 80000614
	s_waitcnt expcnt(1)                                            ; BF8C0F1F
	v_or_b32_e32 v10, 0x60, v18                                    ; 281424FF 00000060
	v_or_b32_e32 v11, 0x70, v18                                    ; 281624FF 00000070
	v_cmp_eq_u32_e32 vcc, 0, v17                                   ; 7D942280
	v_mov_b32_e32 v13, v8                                          ; 7E1A0308
	s_waitcnt vmcnt(2) lgkmcnt(0)                                  ; BF8C0072
	buffer_store_dwordx4 v[1:4], v5, s[0:3], 0 offen glc           ; E07C5000 80000105
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v2, v21, s[4:7], 0 offen                     ; E0501000 80010215
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[1:4], v5, s[0:3], 0 offen glc           ; E07C5000 80000105
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v3, v10, s[4:7], 0 offen                     ; E0501000 8001030A
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[1:4], v5, s[0:3], 0 offen glc           ; E07C5000 80000105
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v4, v11, s[4:7], 0 offen                     ; E0501000 8001040B
	ds_write_b32 v19, v1 offset:784                                ; D81A0310 00000113
	ds_write2_b64 v19, v[13:14], v[1:2] offset0:97 offset1:98      ; D89C6261 00010D13
	ds_write_b64 v19, v[1:2] offset:784                            ; D89A0310 00000113
	ds_write_b32 v19, v3 offset:792                                ; D81A0318 00000313
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	ds_write2_b64 v19, v[1:2], v[3:4] offset0:98 offset1:99        ; D89C6362 00030113
	buffer_store_dwordx4 v[1:4], v5, s[0:3], 0 offen glc           ; E07C5000 80000105
	s_and_saveexec_b64 s[0:1], vcc                                 ; BE80206A
	s_cbranch_execz BB0_4                                          ; BF880000
	s_lshl_b64 s[0:1], s[10:11], 16                                ; 8E80900A
	v_cmp_eq_u32_e32 vcc, 0, v0                                    ; 7D940080
	s_and_saveexec_b64 s[4:5], vcc                                 ; BE84206A
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	v_bfrev_b32_e32 v1, 1                                          ; 7E025881
	v_mov_b32_e32 v2, s15                                          ; 7E04020F
	buffer_store_dword v1, v2, s[0:3], 0 offen glc                 ; E0705000 80000102
	s_or_b64 exec, exec, s[4:5]                                    ; 87FE047E
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	v_mov_b32_e32 v1, 1.0                                          ; 7E0202F2
	v_mov_b32_e32 v2, v1                                           ; 7E040301
	v_mov_b32_e32 v3, v1                                           ; 7E060301
	v_mov_b32_e32 v4, v1                                           ; 7E080301
	v_mad_u32_u24 v0, v0, 24, s15                                  ; D1C30000 003D3100
	buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 glc  ; E07C5004 80000100
	buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:20 glc ; E0745014 80000100
	s_endpgm                                                       ; BF810000

*** SHADER STATS ***
SGPRS: 96
VGPRS: 24
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 656 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************
-------------- next part --------------
TESS_CTRL
PROPERTY TCS_VERTICES_OUT 3
DCL SV[0], INVOCATIONID
DCL SV[1], PRIM_ID
DCL OUT[][0..1], ARRAY(1), CLIPDIST
DCL OUT[2], TESSOUTER
DCL OUT[3].xy, TESSINNER
DCL CONST[0][0..95]
DCL TEMP[0..23], LOCAL
IMM[0] INT32 {3, 8, 0, 0}
IMM[1] FLT32 {    1.0000,     0.0000,     0.0000,     0.0000}
  0: MOV TEMP[0].yzw, OUT[SV[0].x](1)[0].zyzw
  1: UMAD TEMP[1].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
  2: UMUL TEMP[2].x, TEMP[1].xxxx, IMM[0].yyyy
  3: MOV TEMP[0].x, CONST[0][TEMP[2].x].xxxx
  4: MOV OUT[SV[0].x](1)[0], TEMP[0]
  5: MOV TEMP[3].xzw, OUT[SV[0].x](1)[0].xxzw
  6: UMAD TEMP[4].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
  7: UMUL TEMP[5].x, TEMP[4].xxxx, IMM[0].yyyy
  8: MOV TEMP[3].y, CONST[0][TEMP[5].x+1].yxyy
  9: MOV OUT[SV[0].x](1)[0], TEMP[3]
 10: MOV TEMP[6].xyw, OUT[SV[0].x](1)[0].xyxw
 11: UMAD TEMP[7].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 12: UMUL TEMP[8].x, TEMP[7].xxxx, IMM[0].yyyy
 13: MOV TEMP[6].z, CONST[0][TEMP[8].x+2].zzxz
 14: MOV OUT[SV[0].x](1)[0], TEMP[6]
 15: MOV TEMP[9].xyz, OUT[SV[0].x](1)[0].xyzx
 16: UMAD TEMP[10].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 17: UMUL TEMP[11].x, TEMP[10].xxxx, IMM[0].yyyy
 18: MOV TEMP[9].w, CONST[0][TEMP[11].x+3].wwwx
 19: MOV OUT[SV[0].x](1)[0], TEMP[9]
 20: MOV TEMP[12].yzw, OUT[SV[0].x](1)[1].zyzw
 21: UMAD TEMP[13].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 22: UMUL TEMP[14].x, TEMP[13].xxxx, IMM[0].yyyy
 23: MOV TEMP[12].x, CONST[0][TEMP[14].x+4].xxxx
 24: MOV OUT[SV[0].x](1)[1], TEMP[12]
 25: MOV TEMP[15].xzw, OUT[SV[0].x](1)[1].xxzw
 26: UMAD TEMP[16].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 27: UMUL TEMP[17].x, TEMP[16].xxxx, IMM[0].yyyy
 28: MOV TEMP[15].y, CONST[0][TEMP[17].x+5].yxyy
 29: MOV OUT[SV[0].x](1)[1], TEMP[15]
 30: MOV TEMP[18].xyw, OUT[SV[0].x](1)[1].xyxw
 31: UMAD TEMP[19].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 32: UMUL TEMP[20].x, TEMP[19].xxxx, IMM[0].yyyy
 33: MOV TEMP[18].z, CONST[0][TEMP[20].x+6].zzxz
 34: MOV OUT[SV[0].x](1)[1], TEMP[18]
 35: MOV TEMP[21].xyz, OUT[SV[0].x](1)[1].xyzx
 36: UMAD TEMP[22].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx
 37: UMUL TEMP[23].x, TEMP[22].xxxx, IMM[0].yyyy
 38: MOV TEMP[21].w, CONST[0][TEMP[23].x+7].wwwx
 39: MOV OUT[SV[0].x](1)[1], TEMP[21]
 40: MOV OUT[2].x, IMM[1].xxxx
 41: MOV OUT[2].y, IMM[1].xxxx
 42: MOV OUT[2].z, IMM[1].xxxx
 43: MOV OUT[2].w, IMM[1].xxxx
 44: MOV OUT[3].x, IMM[1].xxxx
 45: MOV OUT[3].y, IMM[1].xxxx
 46: END
radeonsi: Compiling shader 2
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #1

define amdgpu_hs void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x float] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32, i32) #2 {
main_body:
  %14 = ptrtoint [0 x float] addrspace(2)* %2 to i64
  %15 = bitcast i64 %14 to <2 x i32>
  %16 = lshr i32 %13, 8
  %17 = and i32 %16, 31
  %18 = shl i32 %5, 2
  %19 = and i32 %18, 262140
  %20 = and i32 %13, 255
  %21 = mul nuw nsw i32 %20, 608
  %22 = add nuw nsw i32 %19, %21
  %23 = mul nuw nsw i32 %17, 200
  %24 = add nuw nsw i32 %22, %23
  %25 = add nuw nsw i32 %24, 192
  %26 = add nuw nsw i32 %24, 193
  %27 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %26
  %28 = load i32, i32 addrspace(3)* %27, align 4, !noalias !0
  %29 = add nuw nsw i32 %24, 194
  %30 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %29
  %31 = load i32, i32 addrspace(3)* %30, align 8, !noalias !0
  %32 = add nuw nsw i32 %24, 195
  %33 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %32
  %34 = load i32, i32 addrspace(3)* %33, align 4, !noalias !0
  %35 = mul i32 %12, 3
  %36 = add i32 %35, %17
  %37 = shl i32 %36, 7
  %38 = shufflevector <2 x i32> %15, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %39 = shufflevector <2 x i32> %15, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
  %40 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 1536, i32 163756>, <4 x i32> %38, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %41 = shufflevector <4 x i32> %40, <4 x i32> %39, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %42 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %37)
  %43 = zext i32 %8 to i64
  %44 = shl nuw nsw i64 %43, 16
  %45 = insertelement <2 x i64> <i64 undef, i64 703330959491071>, i64 %44, i32 0
  %46 = bitcast <2 x i64> %45 to <4 x i32>
  %47 = and i32 %4, 63
  %48 = mul nuw nsw i32 %20, 3
  %49 = add nuw nsw i32 %48, %17
  %50 = mul nuw nsw i32 %47, 144
  %51 = add nuw nsw i32 %49, %50
  %52 = shl nuw nsw i32 %51, 4
  %53 = bitcast float %42 to i32
  %54 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %25
  %55 = bitcast i32 addrspace(3)* %54 to float addrspace(3)*
  store float %42, float addrspace(3)* %55, align 16, !noalias !0
  %56 = insertelement <4 x i32> undef, i32 %53, i32 0
  %57 = insertelement <4 x i32> %56, i32 %28, i32 1
  %58 = insertelement <4 x i32> %57, i32 %31, i32 2
  %59 = insertelement <4 x i32> %58, i32 %34, i32 3
  %60 = add i32 %52, %10
  %61 = bitcast <4 x i32> %59 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %61, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
  %62 = or i32 %37, 16
  %63 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %62)
  store float %42, float addrspace(3)* %55, align 16, !noalias !0
  %64 = bitcast float %63 to i32
  %65 = bitcast i32 addrspace(3)* %27 to float addrspace(3)*
  store float %63, float addrspace(3)* %65, align 4, !noalias !0
  %66 = insertelement <4 x i32> %56, i32 %64, i32 1
  %67 = insertelement <4 x i32> %66, i32 %31, i32 2
  %68 = insertelement <4 x i32> %67, i32 %34, i32 3
  %69 = bitcast <4 x i32> %68 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %69, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
  %70 = or i32 %37, 32
  %71 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %70)
  store float %42, float addrspace(3)* %55, align 16, !noalias !0
  store float %63, float addrspace(3)* %65, align 4, !noalias !0
  %72 = bitcast float %71 to i32
  %73 = bitcast i32 addrspace(3)* %30 to float addrspace(3)*
  store float %71, float addrspace(3)* %73, align 8, !noalias !0
  %74 = insertelement <4 x i32> %66, i32 %72, i32 2
  %75 = insertelement <4 x i32> %74, i32 %34, i32 3
  %76 = bitcast <4 x i32> %75 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %76, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
  %77 = or i32 %37, 48
  %78 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %77)
  store float %42, float addrspace(3)* %55, align 16, !noalias !0
  store float %63, float addrspace(3)* %65, align 4, !noalias !0
  store float %71, float addrspace(3)* %73, align 8, !noalias !0
  %79 = bitcast float %78 to i32
  %80 = bitcast i32 addrspace(3)* %33 to float addrspace(3)*
  store float %78, float addrspace(3)* %80, align 4, !noalias !0
  %81 = insertelement <4 x i32> %74, i32 %79, i32 3
  %82 = bitcast <4 x i32> %81 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %82, <4 x i32> %46, i32 0, i32 %60, i1 true, i1 false) #3
  %83 = add nuw nsw i32 %24, 196
  %84 = add nuw nsw i32 %24, 197
  %85 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %84
  %86 = load i32, i32 addrspace(3)* %85, align 4, !noalias !0
  %87 = add nuw nsw i32 %24, 198
  %88 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %87
  %89 = load i32, i32 addrspace(3)* %88, align 8, !noalias !0
  %90 = add nuw nsw i32 %24, 199
  %91 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %90
  %92 = load i32, i32 addrspace(3)* %91, align 4, !noalias !0
  %93 = or i32 %37, 64
  %94 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %93)
  %95 = mul nuw nsw i32 %47, 147
  %96 = add nuw nsw i32 %49, %95
  %97 = shl nuw nsw i32 %96, 4
  %98 = bitcast float %94 to i32
  %99 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* null, i32 0, i32 %83
  %100 = bitcast i32 addrspace(3)* %99 to float addrspace(3)*
  store float %94, float addrspace(3)* %100, align 16, !noalias !0
  %101 = insertelement <4 x i32> undef, i32 %98, i32 0
  %102 = insertelement <4 x i32> %101, i32 %86, i32 1
  %103 = insertelement <4 x i32> %102, i32 %89, i32 2
  %104 = insertelement <4 x i32> %103, i32 %92, i32 3
  %105 = add i32 %97, %10
  %106 = bitcast <4 x i32> %104 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %106, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
  %107 = or i32 %37, 80
  %108 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %107)
  store float %94, float addrspace(3)* %100, align 16, !noalias !0
  %109 = bitcast float %108 to i32
  %110 = bitcast i32 addrspace(3)* %85 to float addrspace(3)*
  store float %108, float addrspace(3)* %110, align 4, !noalias !0
  %111 = insertelement <4 x i32> %101, i32 %109, i32 1
  %112 = insertelement <4 x i32> %111, i32 %89, i32 2
  %113 = insertelement <4 x i32> %112, i32 %92, i32 3
  %114 = bitcast <4 x i32> %113 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %114, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
  %115 = or i32 %37, 96
  %116 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %115)
  store float %94, float addrspace(3)* %100, align 16, !noalias !0
  store float %108, float addrspace(3)* %110, align 4, !noalias !0
  %117 = bitcast float %116 to i32
  %118 = bitcast i32 addrspace(3)* %88 to float addrspace(3)*
  store float %116, float addrspace(3)* %118, align 8, !noalias !0
  %119 = insertelement <4 x i32> %111, i32 %117, i32 2
  %120 = insertelement <4 x i32> %119, i32 %92, i32 3
  %121 = bitcast <4 x i32> %120 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %121, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
  %122 = or i32 %37, 112
  %123 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %41, i32 %122)
  store float %94, float addrspace(3)* %100, align 16, !noalias !0
  store float %108, float addrspace(3)* %110, align 4, !noalias !0
  store float %116, float addrspace(3)* %118, align 8, !noalias !0
  %124 = bitcast float %123 to i32
  %125 = bitcast i32 addrspace(3)* %91 to float addrspace(3)*
  store float %123, float addrspace(3)* %125, align 4, !noalias !0
  %126 = insertelement <4 x i32> %119, i32 %124, i32 3
  %127 = bitcast <4 x i32> %126 to <4 x float>
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %127, <4 x i32> %46, i32 0, i32 %105, i1 true, i1 false) #3
  %128 = icmp eq i32 %17, 0
  br i1 %128, label %if-true-block.i, label %tcs_epilog.exit

if-true-block.i:                                  ; preds = %main_body
  %129 = zext i32 %9 to i64
  %130 = shl nuw nsw i64 %129, 16
  %131 = insertelement <2 x i64> <i64 undef, i64 703330959491071>, i64 %130, i32 0
  %132 = bitcast <2 x i64> %131 to <4 x i32>
  %133 = mul nuw nsw i32 %20, 24
  %134 = icmp eq i32 %20, 0
  br i1 %134, label %if-true-block2.i, label %endif-block1.i

if-true-block2.i:                                 ; preds = %if-true-block.i
  call void @llvm.amdgcn.buffer.store.f32(float -0.000000e+00, <4 x i32> %132, i32 0, i32 %11, i1 true, i1 false) #3
  br label %endif-block1.i

endif-block1.i:                                   ; preds = %if-true-block2.i, %if-true-block.i
  %135 = add i32 %11, 4
  %136 = add i32 %135, %133
  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x i32> %132, i32 0, i32 %136, i1 true, i1 false) #3
  %137 = add i32 %11, 20
  %138 = add i32 %137, %133
  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <4 x i32> %132, i32 0, i32 %138, i1 true, i1 false) #3
  br label %tcs_epilog.exit

tcs_epilog.exit:                                  ; preds = %main_body, %endif-block1.i
  ret void
}

; Attribute groups; call sites and declarations refer to them as "#N".
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind writeonly }
attributes #2 = { "amdgpu-max-work-group-size"="128" "no-signed-zeros-fp-math"="true" }
; #3 is the group attached to the llvm.amdgcn.buffer.store.* calls in main.
attributes #3 = { inaccessiblememonly nounwind }

; !0 is the scope list used as !noalias on the addrspace(3) loads and stores
; in main. It holds the single scope !1 ("main: argument 0"), which belongs
; to the domain !2 ("main").
!0 = !{!1}
!1 = distinct !{!1, !2, !"main: argument 0"}
!2 = distinct !{!2, !"main"}

SHADER KEY
  part.tcs.epilog.prim_mode = 7
  mono.u.ff_tcs_inputs_to_copy = 0x0

Tessellation Control Shader:
Shader main disassembly:
	; Disassembly of the tessellation control shader's "main" part.
	; The trailing "; XXXXXXXX" on each instruction is its machine encoding.
	;
	; v6 = bits [12:8] of v1; v7 = (v0 * 3 + v6) << 7 is the byte offset of the
	; first load through s[4:7]. s6/s7 of that descriptor are set here
	; (0x600, 0x27fac); s4/s5 are not written before the loads.
	v_mul_lo_i32 v0, v0, 3                                         ; D2850000 00010700
	v_bfe_u32 v6, v1, 8, 5                                         ; D1C80006 02151101
	s_mov_b32 s3, 0x27fac                                          ; BE8300FF 00027FAC
	s_movk_i32 s6, 0x600                                           ; B0060600
	v_add_i32_e32 v0, vcc, v0, v6                                  ; 32000D00
	v_lshlrev_b32_e32 v7, 7, v0                                    ; 240E0087
	s_mov_b32 s7, s3                                               ; BE870003
	buffer_load_dword v2, v7, s[4:7], 0 offen                      ; E0501000 80010207
	; From here v0 = v1 & 0xff.
	; v8 = (v6 * 0xc8 + v0 * 0x260 + ((s9 << 2) & 0x3fffc)) << 2 is the address
	; operand of every ds_* instruction below.
	; v9 = s14 + (((s8 & 63) * 0x90 + v0 * 3 + v6) << 4) is the address operand
	; of the first four buffer stores.
	s_lshl_b32 s0, s9, 2                                           ; 8E008209
	v_and_b32_e32 v0, 0xff, v1                                     ; 260002FF 000000FF
	v_mov_b32_e32 v3, 0x260                                        ; 7E0602FF 00000260
	s_and_b32 s9, s0, 0x3fffc                                      ; 8609FF00 0003FFFC
	v_mov_b32_e32 v4, 0xc8                                         ; 7E0802FF 000000C8
	v_mad_u32_u24 v3, v0, v3, s9                                   ; D1C30003 00260700
	v_mov_b32_e32 v5, 0x90                                         ; 7E0A02FF 00000090
	s_and_b32 s8, s8, 63                                           ; 8608BF08
	v_mad_u32_u24 v1, v0, 3, v6                                    ; D1C30001 04190700
	v_mad_u32_u24 v3, v6, v4, v3                                   ; D1C30003 040E0906
	v_mad_u32_u24 v1, s8, v5, v1                                   ; D1C30001 04060A08
	v_lshlrev_b32_e32 v4, 4, v1                                    ; 24080284
	s_mov_b32 m0, -1                                               ; BEFC00C1
	v_lshlrev_b32_e32 v8, 2, v3                                    ; 24100682
	v_add_i32_e32 v9, vcc, s14, v4                                 ; 3212080E
	; Read LDS dwords 193, 194 and 195 (byte offset 780) into v3..v5: these are
	; the components of the first store that have not been loaded yet.
	ds_read2_b32 v[3:4], v8 offset0:193 offset1:194                ; D86EC2C1 03000008
	ds_read_b32 v5, v8 offset:780                                  ; D86C030C 05000008
	; Store descriptor s[0:3]: s[0:1] = s12 << 16 (s13 is zeroed first),
	; s2 = -1, s3 = 0x27fac. The original s13 is kept in s10 for the second
	; descriptor built after the branch below.
	; v10 = s14 + ((v1 + s8 * 3) << 4) is the address operand of the second
	; group of four buffer stores.
	s_mov_b32 s11, 0                                               ; BE8B0080
	v_mad_u32_u24 v1, s8, 3, v1                                    ; D1C30001 04050608
	s_mov_b32 s10, s13                                             ; BE8A000D
	s_mov_b32 s13, s11                                             ; BE8D000B
	v_lshlrev_b32_e32 v1, 4, v1                                    ; 24020284
	s_mov_b32 s2, -1                                               ; BE8200C1
	s_lshl_b64 s[0:1], s[12:13], 16                                ; 8E80900C
	v_add_i32_e32 v10, vcc, s14, v1                                ; 3214020E
	; Byte offsets of the seven remaining loads: v7 | 16, 32, ..., 0x70.
	v_or_b32_e32 v1, 16, v7                                        ; 28020E90
	v_or_b32_e32 v11, 32, v7                                       ; 28160EA0
	v_or_b32_e32 v12, 48, v7                                       ; 28180EB0
	v_or_b32_e32 v13, 64, v7                                       ; 281A0EC0
	v_or_b32_e32 v14, 0x50, v7                                     ; 281C0EFF 00000050
	v_or_b32_e32 v15, 0x60, v7                                     ; 281E0EFF 00000060
	v_or_b32_e32 v7, 0x70, v7                                      ; 280E0EFF 00000070
	; vcc = (v6 == 0); nothing writes vcc again before s_and_saveexec_b64 below.
	v_cmp_eq_u32_e32 vcc, 0, v6                                    ; 7D940C80
	; First group: the same 16 bytes at v9 are stored four times. Each store
	; follows one more buffer load (v2, v3, v4, v5 in turn), and the loaded
	; values are also written to LDS at byte offsets 768..780.
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	ds_write_b32 v8, v2 offset:768                                 ; D81A0300 00000208
	s_waitcnt lgkmcnt(1)                                           ; BF8C017F
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[2:5], v9, s[0:3], 0 offen glc           ; E07C5000 80000209
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v3, v1, s[4:7], 0 offen                      ; E0501000 80010301
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	ds_write_b64 v8, v[2:3] offset:768                             ; D89A0300 00000208
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[2:5], v9, s[0:3], 0 offen glc           ; E07C5000 80000209
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v4, v11, s[4:7], 0 offen                     ; E0501000 8001040B
	ds_write_b64 v8, v[2:3] offset:768                             ; D89A0300 00000208
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	ds_write_b32 v8, v4 offset:776                                 ; D81A0308 00000408
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[2:5], v9, s[0:3], 0 offen glc           ; E07C5000 80000209
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v5, v12, s[4:7], 0 offen                     ; E0501000 8001050C
	; The load into v1 is the first component of the second group.
	buffer_load_dword v1, v13, s[4:7], 0 offen                     ; E0501000 8001010D
	s_waitcnt vmcnt(1)                                             ; BF8C0F71
	ds_write2_b64 v8, v[2:3], v[4:5] offset0:96 offset1:97         ; D89C6160 00040208
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[2:5], v9, s[0:3], 0 offen glc           ; E07C5000 80000209
	s_waitcnt vmcnt(1)                                             ; BF8C0F71
	ds_write_b32 v8, v1 offset:784                                 ; D81A0310 00000108
	; The LDS reads below overwrite v2..v4, which the store above used as
	; source data.
	; NOTE(review): presumably the expcnt(0) wait is what protects that store's
	; source registers - confirm against the ISA manual for this chip.
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	ds_read2_b32 v[2:3], v8 offset0:197 offset1:198                ; D86EC6C5 02000008
	ds_read_b32 v4, v8 offset:796                                  ; D86C031C 04000008
	; Second group: same pattern as the first, storing v[1:4] to v10 four
	; times and mirroring the loaded values to LDS byte offsets 784..796.
	s_waitcnt lgkmcnt(0)                                           ; BF8C007F
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[1:4], v10, s[0:3], 0 offen glc          ; E07C5000 8000010A
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v2, v14, s[4:7], 0 offen                     ; E0501000 8001020E
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	ds_write_b64 v8, v[1:2] offset:784                             ; D89A0310 00000108
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[1:4], v10, s[0:3], 0 offen glc          ; E07C5000 8000010A
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v3, v15, s[4:7], 0 offen                     ; E0501000 8001030F
	ds_write_b64 v8, v[1:2] offset:784                             ; D89A0310 00000108
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	ds_write_b32 v8, v3 offset:792                                 ; D81A0318 00000308
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[1:4], v10, s[0:3], 0 offen glc          ; E07C5000 8000010A
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	buffer_load_dword v4, v7, s[4:7], 0 offen                      ; E0501000 80010407
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	ds_write2_b64 v8, v[1:2], v[3:4] offset0:98 offset1:99         ; D89C6362 00030108
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[1:4], v10, s[0:3], 0 offen glc          ; E07C5000 8000010A
	; Restrict exec to lanes with v6 == 0 and branch to BB0_4 if none remain
	; (the BB0_4 label itself is not printed in this dump). The saved exec in
	; s[0:1] is overwritten by the next instruction, which rebuilds s[0:1] as
	; s10 << 16 (s11 is 0); s2 and s3 are unchanged.
	s_and_saveexec_b64 s[0:1], vcc                                 ; BE80206A
	s_cbranch_execz BB0_4                                          ; BF880000
	s_lshl_b64 s[0:1], s[10:11], 16                                ; 8E80900A
	; Only in lanes with v0 == 0: store one dword, 0x80000000 (bit-reverse of 1,
	; the bit pattern of -0.0f), at offset s15. exec is then restored from
	; s[4:5].
	v_cmp_eq_u32_e32 vcc, 0, v0                                    ; 7D940080
	s_and_saveexec_b64 s[4:5], vcc                                 ; BE84206A
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	v_bfrev_b32_e32 v1, 1                                          ; 7E025881
	v_mov_b32_e32 v2, s15                                          ; 7E04020F
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dword v1, v2, s[0:3], 0 offen glc                 ; E0705000 80000102
	s_or_b64 exec, exec, s[4:5]                                    ; 87FE047E
	; v1..v4 = 1.0 and v0 = v0 * 24 + s15. Store four dwords at v0 + 4 and two
	; dwords at v0 + 20, then end the program.
	s_waitcnt expcnt(0)                                            ; BF8C0F0F
	v_mov_b32_e32 v1, 1.0                                          ; 7E0202F2
	v_mov_b32_e32 v2, v1                                           ; 7E040301
	v_mov_b32_e32 v3, v1                                           ; 7E060301
	v_mov_b32_e32 v4, v1                                           ; 7E080301
	v_mad_u32_u24 v0, v0, 24, s15                                  ; D1C30000 003D3100
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 glc  ; E07C5004 80000100
	s_waitcnt vmcnt(0)                                             ; BF8C0F70
	buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:20 glc ; E0745014 80000100
	s_endpgm                                                       ; BF810000

*** SHADER STATS ***
SGPRS: 96
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 660 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************