[llvm] r308325 - AMDGPU: Figure out private memory regs after lowering

Mon Jul 24 02:58:32 PDT 2017

Hi Matt,

On 19/07/17 01:44 AM, Matt Arsenault via llvm-commits wrote:
> Author: arsenm
> Date: Tue Jul 18 09:44:56 2017
> New Revision: 308325
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=308325&view=rev
> Log:
> AMDGPU: Figure out private memory regs after lowering
> 
> Introduce pseudo-registers for registers needed for stack
> access, which are replaced during finalizeLowering.
> Note these pseudo-registers are currently only used for the
> used register location, and not for determining their
> input argument register.
> 
> This is better because it avoids the need to try to predict
> whether a call will be emitted from the IR, and also
> detects stack objects introduced by legalization.
> 
> Test changes are from the HasStackObjects check being more
> accurate since stack objects introduced during legalization
> are now known.

This broke the piglit test
spec@glsl-1.50@execution@geometry@max-input-components and a bunch of
tests under spec@arb_gpu_shader_int64@execution@built-in-functions, e.g.
spec@arb_gpu_shader_int64@execution@built-in-functions@fs-op-div-int64_t-i64vec4,
for me on Kaveri.

Attaching the LLVM IR of the shader from the latter, which gets miscompiled.

-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
-------------- next part --------------
; ModuleID = 'tgsi'
source_filename = "tgsi"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn--"

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0

; Function Attrs: nounwind readnone speculatable
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1

; Function Attrs: nounwind
declare void @llvm.amdgcn.exp.compr.v2i16(i32, i32, <2 x i16>, <2 x i16>, i1, i1) #2

define amdgpu_ps void @wrapper([12 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [32 x <4 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), [80 x <8 x i32>] addrspace(2)* byval noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #3 {
main_body:
  %21 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %1, i64 0, i64 16, !amdgpu.uniform !0
  %22 = load <4 x i32>, <4 x i32> addrspace(2)* %21, align 16, !invariant.load !0, !alias.scope !1
  %23 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 0)
  %24 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 4)
  %25 = bitcast float %23 to i32
  %26 = insertelement <2 x i32> undef, i32 %25, i32 0
  %27 = bitcast float %24 to i32
  %28 = insertelement <2 x i32> %26, i32 %27, i32 1
  %29 = bitcast <2 x i32> %28 to i64
  %30 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 16)
  %31 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 20)
  %32 = bitcast float %30 to i32
  %33 = insertelement <2 x i32> undef, i32 %32, i32 0
  %34 = bitcast float %31 to i32
  %35 = insertelement <2 x i32> %33, i32 %34, i32 1
  %36 = bitcast <2 x i32> %35 to i64
  %37 = sdiv i64 %29, %36
  %38 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 24)
  %39 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 28)
  %40 = bitcast float %38 to i32
  %41 = insertelement <2 x i32> undef, i32 %40, i32 0
  %42 = bitcast float %39 to i32
  %43 = insertelement <2 x i32> %41, i32 %42, i32 1
  %44 = bitcast <2 x i32> %43 to i64
  %45 = sdiv i64 %29, %44
  %46 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 32)
  %47 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 36)
  %48 = bitcast float %46 to i32
  %49 = insertelement <2 x i32> undef, i32 %48, i32 0
  %50 = bitcast float %47 to i32
  %51 = insertelement <2 x i32> %49, i32 %50, i32 1
  %52 = bitcast <2 x i32> %51 to i64
  %53 = sdiv i64 %29, %52
  %54 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 40)
  %55 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 44)
  %56 = bitcast float %54 to i32
  %57 = insertelement <2 x i32> undef, i32 %56, i32 0
  %58 = bitcast float %55 to i32
  %59 = insertelement <2 x i32> %57, i32 %58, i32 1
  %60 = bitcast <2 x i32> %59 to i64
  %61 = sdiv i64 %29, %60
  %62 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 48)
  %63 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 52)
  %64 = bitcast float %62 to i32
  %65 = insertelement <2 x i32> undef, i32 %64, i32 0
  %66 = bitcast float %63 to i32
  %67 = insertelement <2 x i32> %65, i32 %66, i32 1
  %68 = bitcast <2 x i32> %67 to i64
  %69 = icmp eq i64 %37, %68
  %70 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 56)
  %71 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 60)
  %72 = bitcast float %70 to i32
  %73 = insertelement <2 x i32> undef, i32 %72, i32 0
  %74 = bitcast float %71 to i32
  %75 = insertelement <2 x i32> %73, i32 %74, i32 1
  %76 = bitcast <2 x i32> %75 to i64
  %77 = icmp eq i64 %45, %76
  %78 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 64)
  %79 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 68)
  %80 = bitcast float %78 to i32
  %81 = insertelement <2 x i32> undef, i32 %80, i32 0
  %82 = bitcast float %79 to i32
  %83 = insertelement <2 x i32> %81, i32 %82, i32 1
  %84 = bitcast <2 x i32> %83 to i64
  %85 = icmp eq i64 %53, %84
  %86 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 72)
  %87 = call nsz float @llvm.SI.load.const.v4i32(<4 x i32> %22, i32 76)
  %88 = bitcast float %86 to i32
  %89 = insertelement <2 x i32> undef, i32 %88, i32 0
  %90 = bitcast float %87 to i32
  %91 = insertelement <2 x i32> %89, i32 %90, i32 1
  %92 = bitcast <2 x i32> %91 to i64
  %93 = icmp eq i64 %61, %92
  %94 = and i1 %69, %77
  %95 = and i1 %85, %93
  %96 = and i1 %94, %95
  %97 = select i1 %96, float 0.000000e+00, float 1.000000e+00
  %98 = select i1 %96, float 1.000000e+00, float 0.000000e+00
  %99 = call nsz <2 x half> @llvm.amdgcn.cvt.pkrtz(float %97, float %98) #0
  %100 = bitcast <2 x half> %99 to <2 x i16>
  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> %100, <2 x i16> <i16 0, i16 15360>, i1 true, i1 true) #2
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
attributes #3 = { "no-signed-zeros-fp-math"="true" }

!0 = !{}
!1 = !{!2}
!2 = distinct !{!2, !3, !"main: argument 0"}
!3 = distinct !{!3, !"main"}