[llvm] r319894 - [InstSimplify] Fold insertelement into undef if index is out of bounds
Michel Dänzer via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 8 08:51:05 PST 2017
Hi Igor,
On 2017-12-06 03:04 PM, Igor Laevsky via llvm-commits wrote:
> Author: igor.laevsky
> Date: Wed Dec 6 06:04:45 2017
> New Revision: 319894
>
> URL: http://llvm.org/viewvc/llvm-project?rev=319894&view=rev
> Log:
> [InstSimplify] Fold insertelement into undef if index is out of bounds
This change broke a bunch of piglit OpenGL tests on AMD GPUs using the
Mesa radeonsi driver. I'm attaching the LLVM IR and the disassembly of the
generated code for both the good and the bad case of one test.
--
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
-------------- next part --------------
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0
; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #3
; @wrapper as dumped in the first attachment (amdgpu_ps calling convention).
; The body reads one texel with image.load.mip and writes it out with
; buffer.store.format at a computed index, then issues a final export.
; Its body matches the @wrapper in the second attachment of this message
; except for the <4 x i32> operand of the buffer.store.format call, which is
; `undef` here.
; NOTE(review): the email does not say which attachment is the "good" and which
; the "bad" case; the undef operand suggests this is the bad one -- confirm.
define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x float] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #4 {
main_body:
; Split pointer argument %2 into two 32-bit lanes.
%22 = ptrtoint [0 x float] addrspace(2)* %2 to i64
%23 = bitcast i64 %22 to <2 x i32>
; Convert float arguments %14 and %15 to signed integers.
%24 = fptosi float %14 to i32
%25 = fptosi float %15 to i32
; Assemble %29 = < %23[0], %23[1], 32, 163756 > through a chain of shuffles.
%26 = shufflevector <2 x i32> %23, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%27 = shufflevector <2 x i32> %23, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
%28 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32, i32 163756>, <4 x i32> %26, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
%29 = shufflevector <4 x i32> %28, <4 x i32> %27, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; Three constant loads through %29 at offsets 0, 4 and 8, reinterpreted as i32.
; They are combined as %39 = (%25 + %34) * %37 + (%24 + %31).
%30 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 0)
%31 = bitcast float %30 to i32
%32 = add i32 %24, %31
%33 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 4)
%34 = bitcast float %33 to i32
%35 = add i32 %25, %34
%36 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 8)
%37 = bitcast float %36 to i32
%38 = mul i32 %35, %37
%39 = add i32 %38, %32
; Load the <8 x i32> at element 16 of the array pointed to by argument %3.
%40 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i64 0, i64 16, !amdgpu.uniform !0
%41 = load <8 x i32>, <8 x i32> addrspace(2)* %40, align 32, !invariant.load !0, !alias.scope !1, !noalias !4
; Build the <4 x i32> operand < %24, %25, 0, undef > for the image load.
%42 = insertelement <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>, i32 %24, i32 0
%43 = insertelement <4 x i32> %42, i32 %25, i32 1
%44 = call nsz <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %43, <8 x i32> %41, i32 15, i1 false, i1 false, i1 false, i1 false) #0
; Store %44 at index %39. The <4 x i32> operand (presumably the buffer resource
; descriptor -- verify against the intrinsic definition) is `undef` in this
; dump. The second attachment instead loads that operand from %3 (index 31 after
; a bitcast to [0 x <4 x i32>]) before the store.
call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %44, <4 x i32> undef, i32 %39, i32 0, i1 false, i1 false) #5, !noalias !1
; Final export; all four float operands are undef.
call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true) #3
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind writeonly }
attributes #3 = { nounwind }
attributes #4 = { "no-signed-zeros-fp-math"="true" }
attributes #5 = { inaccessiblememonly nounwind }
!0 = !{}
!1 = !{!2}
!2 = distinct !{!2, !3, !"main: argument 1"}
!3 = distinct !{!3, !"main"}
!4 = !{!5}
!5 = distinct !{!5, !3, !"main: argument 0"}
SHADER KEY
part.ps.prolog.color_two_side = 0
part.ps.prolog.flatshade_colors = 0
part.ps.prolog.poly_stipple = 0
part.ps.prolog.force_persp_sample_interp = 0
part.ps.prolog.force_linear_sample_interp = 0
part.ps.prolog.force_persp_center_interp = 0
part.ps.prolog.force_linear_center_interp = 0
part.ps.prolog.bc_optimize_for_persp = 0
part.ps.prolog.bc_optimize_for_linear = 0
part.ps.epilog.spi_shader_col_format = 0x0
part.ps.epilog.color_is_int8 = 0x0
part.ps.epilog.color_is_int10 = 0x0
part.ps.epilog.last_cbuf = 0
part.ps.epilog.alpha_func = 7
part.ps.epilog.alpha_to_one = 0
part.ps.epilog.poly_line_smoothing = 0
part.ps.epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_load_dwordx8 s[8:15], s[6:7], 0x200 ; C00E0203 00000200
v_cvt_i32_f32_e32 v0, v2 ; 7E001102
v_cvt_i32_f32_e32 v1, v3 ; 7E021103
v_mov_b32_e32 v2, 0 ; 7E040280
s_mov_b32 s7, 0x27fac ; BE8700FF 00027FAC
s_mov_b32 s6, 32 ; BE8600A0
s_waitcnt lgkmcnt(0) ; BF8C007F
image_load_mip v[2:5], v[0:3], s[8:15] dmask:0xf unorm ; F0041F00 00020200
s_buffer_load_dwordx2 s[0:1], s[4:7], 0x0 ; C0260002 00000000
s_buffer_load_dword s2, s[4:7], 0x8 ; C0220082 00000008
s_waitcnt lgkmcnt(0) ; BF8C007F
v_add_u32_e32 v1, vcc, s1, v1 ; 32020201
v_mul_lo_i32 v1, v1, s2 ; D2850001 00000501
v_add_u32_e32 v0, vcc, s0, v0 ; 32000000
v_add_u32_e32 v0, vcc, v1, v0 ; 32000101
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_format_xyzw v[2:5], v0, s[0:3], 0 idxen ; E01C2000 80000200
exp null off, off, off, off done vm ; C4001890 00000000
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0301
SPI_PS_INPUT_ENA = 0x0301
*** SHADER STATS ***
SGPRS: 96
VGPRS: 8
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 108 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************
-------------- next part --------------
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"
; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0
; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #3
; @wrapper as dumped in the second attachment (amdgpu_ps calling convention).
; The body reads one texel with image.load.mip and writes it out with
; buffer.store.format at a computed index, then issues a final export.
; Its body matches the @wrapper in the first attachment of this message except
; that the <4 x i32> operand of the buffer.store.format call is loaded from
; memory (%45-%47) here, where the first attachment passes `undef`.
; NOTE(review): the email does not say which attachment is the "good" and which
; the "bad" case; the loaded operand suggests this is the good one -- confirm.
define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x float] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #4 {
main_body:
; Split pointer argument %2 into two 32-bit lanes.
%22 = ptrtoint [0 x float] addrspace(2)* %2 to i64
%23 = bitcast i64 %22 to <2 x i32>
; Convert float arguments %14 and %15 to signed integers.
%24 = fptosi float %14 to i32
%25 = fptosi float %15 to i32
; Assemble %29 = < %23[0], %23[1], 32, 163756 > through a chain of shuffles.
%26 = shufflevector <2 x i32> %23, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%27 = shufflevector <2 x i32> %23, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
%28 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32, i32 163756>, <4 x i32> %26, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
%29 = shufflevector <4 x i32> %28, <4 x i32> %27, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; Three constant loads through %29 at offsets 0, 4 and 8, reinterpreted as i32.
; They are combined as %39 = (%25 + %34) * %37 + (%24 + %31).
%30 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 0)
%31 = bitcast float %30 to i32
%32 = add i32 %24, %31
%33 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 4)
%34 = bitcast float %33 to i32
%35 = add i32 %25, %34
%36 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 8)
%37 = bitcast float %36 to i32
%38 = mul i32 %35, %37
%39 = add i32 %38, %32
; Load the <8 x i32> at element 16 of the array pointed to by argument %3.
%40 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i64 0, i64 16, !amdgpu.uniform !0
%41 = load <8 x i32>, <8 x i32> addrspace(2)* %40, align 32, !invariant.load !0, !alias.scope !1, !noalias !4
; Build the <4 x i32> operand < %24, %25, 0, undef > for the image load.
%42 = insertelement <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>, i32 %24, i32 0
%43 = insertelement <4 x i32> %42, i32 %25, i32 1
%44 = call nsz <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %43, <8 x i32> %41, i32 15, i1 false, i1 false, i1 false, i1 false) #0
; Reinterpret %3 as an array of <4 x i32> and load element 31. With 16-byte
; elements that is byte offset 0x1f0, which matches the s_load_dwordx4 offset
; in the disassembly that follows this module.
%45 = bitcast [80 x <8 x i32>] addrspace(2)* %3 to [0 x <4 x i32>] addrspace(2)*
%46 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %45, i64 0, i64 31, !amdgpu.uniform !0
%47 = load <4 x i32>, <4 x i32> addrspace(2)* %46, align 16, !invariant.load !0, !alias.scope !1, !noalias !4
; Store %44 at index %39, passing the loaded %47 as the <4 x i32> operand
; (presumably the buffer resource descriptor -- verify against the intrinsic
; definition). The first attachment passes `undef` in this position.
call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %44, <4 x i32> %47, i32 %39, i32 0, i1 false, i1 false) #5, !noalias !1
; Final export; all four float operands are undef.
call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true) #3
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind writeonly }
attributes #3 = { nounwind }
attributes #4 = { "no-signed-zeros-fp-math"="true" }
attributes #5 = { inaccessiblememonly nounwind }
!0 = !{}
!1 = !{!2}
!2 = distinct !{!2, !3, !"main: argument 1"}
!3 = distinct !{!3, !"main"}
!4 = !{!5}
!5 = distinct !{!5, !3, !"main: argument 0"}
SHADER KEY
part.ps.prolog.color_two_side = 0
part.ps.prolog.flatshade_colors = 0
part.ps.prolog.poly_stipple = 0
part.ps.prolog.force_persp_sample_interp = 0
part.ps.prolog.force_linear_sample_interp = 0
part.ps.prolog.force_persp_center_interp = 0
part.ps.prolog.force_linear_center_interp = 0
part.ps.prolog.bc_optimize_for_persp = 0
part.ps.prolog.bc_optimize_for_linear = 0
part.ps.epilog.spi_shader_col_format = 0x0
part.ps.epilog.color_is_int8 = 0x0
part.ps.epilog.color_is_int10 = 0x0
part.ps.epilog.last_cbuf = 0
part.ps.epilog.alpha_func = 7
part.ps.epilog.alpha_to_one = 0
part.ps.epilog.poly_line_smoothing = 0
part.ps.epilog.clamp_color = 0
Pixel Shader:
Shader main disassembly:
s_load_dwordx4 s[0:3], s[6:7], 0x1f0 ; C00A0003 000001F0
s_load_dwordx8 s[8:15], s[6:7], 0x200 ; C00E0203 00000200
v_cvt_i32_f32_e32 v0, v2 ; 7E001102
v_cvt_i32_f32_e32 v1, v3 ; 7E021103
v_mov_b32_e32 v2, 0 ; 7E040280
s_mov_b32 s7, 0x27fac ; BE8700FF 00027FAC
s_mov_b32 s6, 32 ; BE8600A0
s_waitcnt lgkmcnt(0) ; BF8C007F
image_load_mip v[2:5], v[0:3], s[8:15] dmask:0xf unorm ; F0041F00 00020200
s_buffer_load_dwordx2 s[8:9], s[4:7], 0x0 ; C0260202 00000000
s_buffer_load_dword s4, s[4:7], 0x8 ; C0220102 00000008
s_waitcnt lgkmcnt(0) ; BF8C007F
v_add_u32_e32 v1, vcc, s9, v1 ; 32020209
v_mul_lo_i32 v1, v1, s4 ; D2850001 00000901
v_add_u32_e32 v0, vcc, s8, v0 ; 32000008
v_add_u32_e32 v0, vcc, v1, v0 ; 32000101
s_waitcnt vmcnt(0) ; BF8C0F70
buffer_store_format_xyzw v[2:5], v0, s[0:3], 0 idxen ; E01C2000 80000200
exp null off, off, off, off done vm ; C4001890 00000000
s_endpgm ; BF810000
*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0301
SPI_PS_INPUT_ENA = 0x0301
*** SHADER STATS ***
SGPRS: 96
VGPRS: 8
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 116 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************
More information about the llvm-commits
mailing list