[llvm] r319894 - [InstSimplify] Fold insertelement into undef if index is out of bounds

Fri Dec 8 08:51:05 PST 2017

Hi Igor,

On 2017-12-06 03:04 PM, Igor Laevsky via llvm-commits wrote:
> Author: igor.laevsky
> Date: Wed Dec  6 06:04:45 2017
> New Revision: 319894
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=319894&view=rev
> Log:
> [InstSimplify] Fold insertelement into undef if index is out of bounds

This change broke a bunch of piglit OpenGL tests with AMD GPUs using the
Mesa radeonsi driver. I'm attaching the LLVM IR and disassembly of the
generated code for the good and bad case of one test.

-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
-------------- next part --------------
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2

; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #3

; @wrapper: shader entry point using the amdgpu_ps calling convention (the dump
; below labels it "Pixel Shader"). All 22 parameters are unnamed (%0..%21); the
; body reads %2 ([0 x float] pointer), %3 ([80 x <8 x i32>] pointer) and the
; float arguments %14 and %15. It performs one image load and stores the result
; with a buffer store, then issues an export.
define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x float] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #4 {
main_body:
  ; Reinterpret the 64-bit value of pointer %2 as two 32-bit lanes.
  %22 = ptrtoint [0 x float] addrspace(2)* %2 to i64
  %23 = bitcast i64 %22 to <2 x i32>
  ; Convert the float arguments %14 and %15 to signed integers.
  %24 = fptosi float %14 to i32
  %25 = fptosi float %15 to i32
  ; Assemble %29 = <%23[0], %23[1], 32, 163756>: lanes 0 and 1 come from the
  ; pointer halves, lanes 2 and 3 from the constant vector (163756 == 0x27FAC).
  ; NOTE(review): presumably a buffer descriptor for the constant loads below -- confirm.
  %26 = shufflevector <2 x i32> %23, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %27 = shufflevector <2 x i32> %23, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
  %28 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32, i32 163756>, <4 x i32> %26, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %29 = shufflevector <4 x i32> %28, <4 x i32> %27, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ; Three llvm.SI.load.const calls on %29 with second operand 0, 4 and 8; each
  ; result is bitcast to i32. Combined: %39 = (%25 + %34) * %37 + (%24 + %31).
  %30 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 0)
  %31 = bitcast float %30 to i32
  %32 = add i32 %24, %31
  %33 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 4)
  %34 = bitcast float %33 to i32
  %35 = add i32 %25, %34
  %36 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 8)
  %37 = bitcast float %36 to i32
  %38 = mul i32 %35, %37
  %39 = add i32 %38, %32
  ; Load the <8 x i32> at element 16 of the array pointed to by %3; it is the
  ; second operand of the image load below.
  %40 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i64 0, i64 16, !amdgpu.uniform !0
  %41 = load <8 x i32>, <8 x i32> addrspace(2)* %40, align 32, !invariant.load !0, !alias.scope !1, !noalias !4
  ; Build the first image-load operand %43 = <%24, %25, 0, undef>.
  %42 = insertelement <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>, i32 %24, i32 0
  %43 = insertelement <4 x i32> %42, i32 %25, i32 1
  %44 = call nsz <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %43, <8 x i32> %41, i32 15, i1 false, i1 false, i1 false, i1 false) #0
  ; Store %44 with %39 as the third operand. The second operand here is
  ; `<4 x i32> undef`; the other IR dump in this message passes a <4 x i32>
  ; loaded from %3 in that position.
  ; NOTE(review): presumably this dump is the "bad" case from the report -- confirm.
  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %44, <4 x i32> undef, i32 %39, i32 0, i1 false, i1 false) #5, !noalias !1
  ; Export with all four float operands undef and both trailing i1 flags true.
  call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true) #3
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind writeonly }
attributes #3 = { nounwind }
attributes #4 = { "no-signed-zeros-fp-math"="true" }
attributes #5 = { inaccessiblememonly nounwind }

!0 = !{}
!1 = !{!2}
!2 = distinct !{!2, !3, !"main: argument 1"}
!3 = distinct !{!3, !"main"}
!4 = !{!5}
!5 = distinct !{!5, !3, !"main: argument 0"}

SHADER KEY
  part.ps.prolog.color_two_side = 0
  part.ps.prolog.flatshade_colors = 0
  part.ps.prolog.poly_stipple = 0
  part.ps.prolog.force_persp_sample_interp = 0
  part.ps.prolog.force_linear_sample_interp = 0
  part.ps.prolog.force_persp_center_interp = 0
  part.ps.prolog.force_linear_center_interp = 0
  part.ps.prolog.bc_optimize_for_persp = 0
  part.ps.prolog.bc_optimize_for_linear = 0
  part.ps.epilog.spi_shader_col_format = 0x0
  part.ps.epilog.color_is_int8 = 0x0
  part.ps.epilog.color_is_int10 = 0x0
  part.ps.epilog.last_cbuf = 0
  part.ps.epilog.alpha_func = 7
  part.ps.epilog.alpha_to_one = 0
  part.ps.epilog.poly_line_smoothing = 0
  part.ps.epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
	s_load_dwordx8 s[8:15], s[6:7], 0x200                  ; C00E0203 00000200
	v_cvt_i32_f32_e32 v0, v2                               ; 7E001102
	v_cvt_i32_f32_e32 v1, v3                               ; 7E021103
	v_mov_b32_e32 v2, 0                                    ; 7E040280
	s_mov_b32 s7, 0x27fac                                  ; BE8700FF 00027FAC
	s_mov_b32 s6, 32                                       ; BE8600A0
	s_waitcnt lgkmcnt(0)                                   ; BF8C007F
	image_load_mip v[2:5], v[0:3], s[8:15] dmask:0xf unorm ; F0041F00 00020200
	s_buffer_load_dwordx2 s[0:1], s[4:7], 0x0              ; C0260002 00000000
	s_buffer_load_dword s2, s[4:7], 0x8                    ; C0220082 00000008
	s_waitcnt lgkmcnt(0)                                   ; BF8C007F
	v_add_u32_e32 v1, vcc, s1, v1                          ; 32020201
	v_mul_lo_i32 v1, v1, s2                                ; D2850001 00000501
	v_add_u32_e32 v0, vcc, s0, v0                          ; 32000000
	v_add_u32_e32 v0, vcc, v1, v0                          ; 32000101
	s_waitcnt vmcnt(0)                                     ; BF8C0F70
	buffer_store_format_xyzw v[2:5], v0, s[0:3], 0 idxen   ; E01C2000 80000200
	exp null off, off, off, off done vm                    ; C4001890 00000000
	s_endpgm                                               ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0301
SPI_PS_INPUT_ENA  = 0x0301
*** SHADER STATS ***
SGPRS: 96
VGPRS: 8
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 108 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************

-------------- next part --------------
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1

; Function Attrs: nounwind writeonly
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2

; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #3

; @wrapper: shader entry point using the amdgpu_ps calling convention (the dump
; below labels it "Pixel Shader"). All 22 parameters are unnamed (%0..%21); the
; body reads %2 ([0 x float] pointer), %3 ([80 x <8 x i32>] pointer) and the
; float arguments %14 and %15. It performs one image load and stores the result
; with a buffer store whose second operand is loaded from %3, then issues an
; export.
define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [0 x float] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #4 {
main_body:
  ; Reinterpret the 64-bit value of pointer %2 as two 32-bit lanes.
  %22 = ptrtoint [0 x float] addrspace(2)* %2 to i64
  %23 = bitcast i64 %22 to <2 x i32>
  ; Convert the float arguments %14 and %15 to signed integers.
  %24 = fptosi float %14 to i32
  %25 = fptosi float %15 to i32
  ; Assemble %29 = <%23[0], %23[1], 32, 163756>: lanes 0 and 1 come from the
  ; pointer halves, lanes 2 and 3 from the constant vector (163756 == 0x27FAC).
  ; NOTE(review): presumably a buffer descriptor for the constant loads below -- confirm.
  %26 = shufflevector <2 x i32> %23, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %27 = shufflevector <2 x i32> %23, <2 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
  %28 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32, i32 163756>, <4 x i32> %26, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %29 = shufflevector <4 x i32> %28, <4 x i32> %27, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ; Three llvm.SI.load.const calls on %29 with second operand 0, 4 and 8; each
  ; result is bitcast to i32. Combined: %39 = (%25 + %34) * %37 + (%24 + %31).
  %30 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 0)
  %31 = bitcast float %30 to i32
  %32 = add i32 %24, %31
  %33 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 4)
  %34 = bitcast float %33 to i32
  %35 = add i32 %25, %34
  %36 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %29, i32 8)
  %37 = bitcast float %36 to i32
  %38 = mul i32 %35, %37
  %39 = add i32 %38, %32
  ; Load the <8 x i32> at element 16 of the array pointed to by %3; it is the
  ; second operand of the image load below.
  %40 = getelementptr [80 x <8 x i32>], [80 x <8 x i32>] addrspace(2)* %3, i64 0, i64 16, !amdgpu.uniform !0
  %41 = load <8 x i32>, <8 x i32> addrspace(2)* %40, align 32, !invariant.load !0, !alias.scope !1, !noalias !4
  ; Build the first image-load operand %43 = <%24, %25, 0, undef>.
  %42 = insertelement <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>, i32 %24, i32 0
  %43 = insertelement <4 x i32> %42, i32 %25, i32 1
  %44 = call nsz <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %43, <8 x i32> %41, i32 15, i1 false, i1 false, i1 false, i1 false) #0
  ; View %3 as an array of <4 x i32> and load element 31 (31 * 16 bytes = 0x1f0
  ; from %3); this is the second operand of the store below. The other IR dump
  ; in this message has `<4 x i32> undef` in that position and lacks these
  ; three instructions.
  ; NOTE(review): presumably this dump is the "good" case from the report -- confirm.
  %45 = bitcast [80 x <8 x i32>] addrspace(2)* %3 to [0 x <4 x i32>] addrspace(2)*
  %46 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %45, i64 0, i64 31, !amdgpu.uniform !0
  %47 = load <4 x i32>, <4 x i32> addrspace(2)* %46, align 16, !invariant.load !0, !alias.scope !1, !noalias !4
  ; Store %44 with %47 as the second operand and %39 as the third.
  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %44, <4 x i32> %47, i32 %39, i32 0, i1 false, i1 false) #5, !noalias !1
  ; Export with all four float operands undef and both trailing i1 flags true.
  call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true) #3
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind writeonly }
attributes #3 = { nounwind }
attributes #4 = { "no-signed-zeros-fp-math"="true" }
attributes #5 = { inaccessiblememonly nounwind }

!0 = !{}
!1 = !{!2}
!2 = distinct !{!2, !3, !"main: argument 1"}
!3 = distinct !{!3, !"main"}
!4 = !{!5}
!5 = distinct !{!5, !3, !"main: argument 0"}

SHADER KEY
  part.ps.prolog.color_two_side = 0
  part.ps.prolog.flatshade_colors = 0
  part.ps.prolog.poly_stipple = 0
  part.ps.prolog.force_persp_sample_interp = 0
  part.ps.prolog.force_linear_sample_interp = 0
  part.ps.prolog.force_persp_center_interp = 0
  part.ps.prolog.force_linear_center_interp = 0
  part.ps.prolog.bc_optimize_for_persp = 0
  part.ps.prolog.bc_optimize_for_linear = 0
  part.ps.epilog.spi_shader_col_format = 0x0
  part.ps.epilog.color_is_int8 = 0x0
  part.ps.epilog.color_is_int10 = 0x0
  part.ps.epilog.last_cbuf = 0
  part.ps.epilog.alpha_func = 7
  part.ps.epilog.alpha_to_one = 0
  part.ps.epilog.poly_line_smoothing = 0
  part.ps.epilog.clamp_color = 0

Pixel Shader:
Shader main disassembly:
	s_load_dwordx4 s[0:3], s[6:7], 0x1f0                   ; C00A0003 000001F0
	s_load_dwordx8 s[8:15], s[6:7], 0x200                  ; C00E0203 00000200
	v_cvt_i32_f32_e32 v0, v2                               ; 7E001102
	v_cvt_i32_f32_e32 v1, v3                               ; 7E021103
	v_mov_b32_e32 v2, 0                                    ; 7E040280
	s_mov_b32 s7, 0x27fac                                  ; BE8700FF 00027FAC
	s_mov_b32 s6, 32                                       ; BE8600A0
	s_waitcnt lgkmcnt(0)                                   ; BF8C007F
	image_load_mip v[2:5], v[0:3], s[8:15] dmask:0xf unorm ; F0041F00 00020200
	s_buffer_load_dwordx2 s[8:9], s[4:7], 0x0              ; C0260202 00000000
	s_buffer_load_dword s4, s[4:7], 0x8                    ; C0220102 00000008
	s_waitcnt lgkmcnt(0)                                   ; BF8C007F
	v_add_u32_e32 v1, vcc, s9, v1                          ; 32020209
	v_mul_lo_i32 v1, v1, s4                                ; D2850001 00000901
	v_add_u32_e32 v0, vcc, s8, v0                          ; 32000008
	v_add_u32_e32 v0, vcc, v1, v0                          ; 32000101
	s_waitcnt vmcnt(0)                                     ; BF8C0F70
	buffer_store_format_xyzw v[2:5], v0, s[0:3], 0 idxen   ; E01C2000 80000200
	exp null off, off, off, off done vm                    ; C4001890 00000000
	s_endpgm                                               ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0x0301
SPI_PS_INPUT_ENA  = 0x0301
*** SHADER STATS ***
SGPRS: 96
VGPRS: 8
Spilled SGPRs: 0
Spilled VGPRs: 0
Private memory VGPRs: 0
Code Size: 116 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 8
********************