[llvm] r289076 - [AMDGPU] Scalarization of global uniform loads.
Michel Dänzer via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 9 00:09:10 PST 2016
Hi Alexander,
On 09/12/16 02:28 AM, Alexander Timofeev via llvm-commits wrote:
> Author: alex-t
> Date: Thu Dec 8 11:28:47 2016
> New Revision: 289076
>
> URL: http://llvm.org/viewvc/llvm-project?rev=289076&view=rev
> Log:
> [AMDGPU] Scalarization of global uniform loads.
This change broke a few OpenCL piglit tests for me with the Mesa OpenCL
implementation on Kaveri, e.g.
program@execute@store@store-double16-global:
LLVM ERROR: Cannot select: t77: v16i32,ch = load<LD64[%0(addrspace=1)(align=128)+64](align=64)(tbaa=<0x559592d62578>)> t11, t76, undef:i64
t76: i64 = add t344, Constant:i64<64>
t344: i64 = bitcast t343
t343: v2i32,ch = load<LD8[undef(addrspace=2)](nontemporal)(dereferenceable)(invariant)> t0, t8, undef:i64
t8: i64 = add t2, Constant:i64<8>
t2: i64,ch = CopyFromReg t0, Register:i64 %vreg1
t1: i64 = Register %vreg1
t7: i64 = Constant<8>
t4: i64 = undef
t75: i64 = Constant<64>
t4: i64 = undef
In function: store_global
You can reproduce it by feeding the attached LLVM IR to
llc -march=amdgcn
--
Earthling Michel Dänzer | http://www.amd.com
Libre software enthusiast | Mesa and X developer
-------------- next part --------------
; ModuleID = 'link'
source_filename = "link"
; Data layout: little-endian ("e"). Per the pointer specs in the string below,
; address spaces 1, 2 and 4 use 64-bit pointers, while the default address
; space and address spaces 3 and 5 use 32-bit pointers.
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
; Triple components: architecture "amdgcn", vendor "mesa", OS "mesa3d".
target triple = "amdgcn-mesa-mesa3d"
; Kernel @store_global: copies eight consecutive <16 x double> elements
; (indices 0 through 7) from %in to the same indices of %out. Both pointers are
; in addrspace(1); every load and store is 128-byte aligned and tagged with
; TBAA node !6. All addresses are constant offsets from the kernel arguments;
; no work-item or work-group ID is read.
; This is the function named by "In function: store_global" in the
; "Cannot select" error quoted in the accompanying message.
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @store_global(<16 x double> addrspace(1)* nocapture %out, <16 x double> addrspace(1)* nocapture readonly %in) local_unnamed_addr #0 !kernel_arg_addr_space !0 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
entry:
; Element 0: loaded/stored directly through %in / %out.
%0 = load <16 x double>, <16 x double> addrspace(1)* %in, align 128, !tbaa !6
store <16 x double> %0, <16 x double> addrspace(1)* %out, align 128, !tbaa !6
; Elements 1..7: the same load/store pair, addressed via getelementptr with a
; constant element index.
%arrayidx2 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %in, i64 1
%1 = load <16 x double>, <16 x double> addrspace(1)* %arrayidx2, align 128, !tbaa !6
%arrayidx3 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %out, i64 1
store <16 x double> %1, <16 x double> addrspace(1)* %arrayidx3, align 128, !tbaa !6
%arrayidx4 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %in, i64 2
%2 = load <16 x double>, <16 x double> addrspace(1)* %arrayidx4, align 128, !tbaa !6
%arrayidx5 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %out, i64 2
store <16 x double> %2, <16 x double> addrspace(1)* %arrayidx5, align 128, !tbaa !6
%arrayidx6 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %in, i64 3
%3 = load <16 x double>, <16 x double> addrspace(1)* %arrayidx6, align 128, !tbaa !6
%arrayidx7 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %out, i64 3
store <16 x double> %3, <16 x double> addrspace(1)* %arrayidx7, align 128, !tbaa !6
%arrayidx8 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %in, i64 4
%4 = load <16 x double>, <16 x double> addrspace(1)* %arrayidx8, align 128, !tbaa !6
%arrayidx9 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %out, i64 4
store <16 x double> %4, <16 x double> addrspace(1)* %arrayidx9, align 128, !tbaa !6
%arrayidx10 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %in, i64 5
%5 = load <16 x double>, <16 x double> addrspace(1)* %arrayidx10, align 128, !tbaa !6
%arrayidx11 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %out, i64 5
store <16 x double> %5, <16 x double> addrspace(1)* %arrayidx11, align 128, !tbaa !6
%arrayidx12 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %in, i64 6
%6 = load <16 x double>, <16 x double> addrspace(1)* %arrayidx12, align 128, !tbaa !6
%arrayidx13 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %out, i64 6
store <16 x double> %6, <16 x double> addrspace(1)* %arrayidx13, align 128, !tbaa !6
%arrayidx14 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %in, i64 7
%7 = load <16 x double>, <16 x double> addrspace(1)* %arrayidx14, align 128, !tbaa !6
%arrayidx15 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %out, i64 7
store <16 x double> %7, <16 x double> addrspace(1)* %arrayidx15, align 128, !tbaa !6
ret void
}
; Kernel @store_global_wi: copies a single <16 x double> element from
; %in[idx] to %out[idx], where
;   idx = workgroup.id.x * (low 16 bits of the i32 at dispatch_ptr + 4 bytes)
;         + workitem.id.x
;         + (i32 at implicitarg_ptr + 4 bytes, zero-extended)
; NOTE(review): this looks like an inlined OpenCL get_global_id(0)
; (group id * local size + local id + global offset) -- confirm against the
; original kernel source, which is not part of this file.
; Function Attrs: nounwind
define amdgpu_kernel void @store_global_wi(<16 x double> addrspace(1)* nocapture %out, <16 x double> addrspace(1)* nocapture readonly %in) local_unnamed_addr #1 !kernel_arg_addr_space !0 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
entry:
; Work-group ID in X, widened to i64.
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x() #3
%retval.0.i8.i = zext i32 %0 to i64
; Invariant 32-bit load at byte offset 4 of the dispatch pointer; only the low
; 16 bits are kept (presumably the work-group X size, judging by the value
; names -- verify against the dispatch packet layout).
%dispatch_ptr.i9.i = tail call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #3
%xy_size_ptr.i10.i = getelementptr inbounds i8, i8 addrspace(2)* %dispatch_ptr.i9.i, i64 4
%1 = bitcast i8 addrspace(2)* %xy_size_ptr.i10.i to i32 addrspace(2)*
%xy_size.i11.i = load i32, i32 addrspace(2)* %1, align 4, !invariant.load !9
%x_size.i.i = and i32 %xy_size.i11.i, 65535
%x_size.ext.i.i = zext i32 %x_size.i.i to i64
%mul29.i = mul nuw nsw i64 %x_size.ext.i.i, %retval.0.i8.i
; Work-item ID in X; !range !10 bounds it to [0, 1024).
%2 = tail call i32 @llvm.amdgcn.workitem.id.x() #3, !range !10
%retval.0.i638.i = zext i32 %2 to i64
%add39.i = add nuw nsw i64 %mul29.i, %retval.0.i638.i
; 32-bit load at byte offset 4 of the implicit-argument pointer, added to the
; index (presumably a global offset -- TODO confirm).
%3 = tail call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #3
%arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(2)* %3, i64 4
%4 = bitcast i8 addrspace(2)* %arrayidx.i.i to i32 addrspace(2)*
%5 = load i32, i32 addrspace(2)* %4, align 4, !tbaa !11
%conv.i.i = zext i32 %5 to i64
%add4.i = add nuw nsw i64 %add39.i, %conv.i.i
; Copy one 128-byte-aligned <16 x double> element at the computed index.
%arrayidx = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %in, i64 %add4.i
%6 = load <16 x double>, <16 x double> addrspace(1)* %arrayidx, align 128, !tbaa !6
%arrayidx1 = getelementptr inbounds <16 x double>, <16 x double> addrspace(1)* %out, i64 %add4.i
store <16 x double> %6, <16 x double> addrspace(1)* %arrayidx1, align 128, !tbaa !6
ret void
}
; Declarations of the AMDGPU intrinsics called by @store_global_wi. All four
; use attribute group #2; the two pointer-returning intrinsics return
; addrspace(2) pointers.
; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workgroup.id.x() #2
; Function Attrs: nounwind readnone
declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #2
; Function Attrs: nounwind readnone
declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #2
; Attribute groups:
;   #0 - @store_global; same as #1 plus norecurse.
;   #1 - @store_global_wi.
;   #2 - the intrinsic declarations.
;   #3 - the intrinsic call sites inside @store_global_wi.
; Both kernel groups set "target-cpu"="kaveri".
attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="kaveri" "target-features"="+fp64-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="kaveri" "target-features"="+fp64-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
; Metadata nodes, by where they are referenced in this module:
;   !0       - !opencl.ocl.version entries and !kernel_arg_addr_space on both kernels
;   !1       - !llvm.ident (producer string)
;   !2 .. !5 - remaining per-kernel argument metadata (access qualifier, type,
;              base type, type qualifier)
;   !6 .. !8 - TBAA tag ("omnipotent char") on the <16 x double> loads and stores
;   !9       - empty node used as the !invariant.load operand
;   !10      - !range [0, 1024) on the workitem.id.x call
;   !11, !12 - TBAA tag ("int") on the implicit-argument load
!opencl.ocl.version = !{!0, !0}
!llvm.ident = !{!1, !1}
!0 = !{i32 1, i32 1}
!1 = !{!"clang version 4.0.0 (http://llvm.org/git/clang.git 6e73e3464e96a4e00492c24aa790d36e1adb5702) (llvm/trunk 289187)"}
!2 = !{!"none", !"none"}
!3 = !{!"type_t*", !"type_t*"}
!4 = !{!"double __attribute__((ext_vector_type(16)))*", !"double __attribute__((ext_vector_type(16)))*"}
!5 = !{!"", !""}
!6 = !{!7, !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C/C++ TBAA"}
!9 = !{}
!10 = !{i32 0, i32 1024}
!11 = !{!12, !12, i64 0}
!12 = !{!"int", !7, i64 0}
More information about the llvm-commits
mailing list