[PATCH] D52548: Stop instcombining propagating wider shufflevector arguments to predecessors.
Jan Vesely via llvm-commits
llvm-commits at lists.llvm.org
Sat Sep 29 06:52:46 PDT 2018
On Fri, 2018-09-28 at 15:26 +0000, Sanjay Patel via Phabricator via
llvm-commits wrote:
> This revision was automatically updated to reflect the committed changes.
> Closed by commit rL343329: [InstCombine] don't propagate wider shufflevector arguments to predecessors (authored by spatel, committed by ).
>
> Changed prior to commit:
> https://reviews.llvm.org/D52548?vs=167472&id=167481#toc
>
> Repository:
> rL LLVM
>
> https://reviews.llvm.org/D52548
>
> Files:
> llvm/trunk/include/llvm/IR/Instructions.h
> llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
> llvm/trunk/test/Transforms/InstCombine/vec_shuffle.ll
>
>
> Index: llvm/trunk/test/Transforms/InstCombine/vec_shuffle.ll
> ===================================================================
> --- llvm/trunk/test/Transforms/InstCombine/vec_shuffle.ll
> +++ llvm/trunk/test/Transforms/InstCombine/vec_shuffle.ll
> @@ -184,27 +184,32 @@
> ret <2 x i8> %D
> }
>
> -; TODO: Increasing length of vector ops is not a good canonicalization.
> -
> +; Increasing length of vector ops is not a good canonicalization.
> +
> define <3 x i32> @add_wider(i32 %y, i32 %z) {
> -; CHECK-LABEL: @add(
> -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[Y:%.*]], i32 0
> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
> -; CHECK-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP2]], <i32 255, i32 255, i32 undef>
> -; CHECK-NEXT: ret <3 x i32> [[TMP3]]
> +; CHECK-LABEL: @add_wider(
> +; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i32> undef, i32 [[Y:%.*]], i32 0
> +; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i32> [[I0]], i32 [[Z:%.*]], i32 1
> +; CHECK-NEXT: [[A:%.*]] = add <2 x i32> [[I1]], <i32 255, i32 255>
> +; CHECK-NEXT: [[EXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 undef>
> +; CHECK-NEXT: ret <3 x i32> [[EXT]]
> ;
> %i0 = insertelement <2 x i32> undef, i32 %y, i32 0
> %i1 = insertelement <2 x i32> %i0, i32 %z, i32 1
> %a = add <2 x i32> %i1, <i32 255, i32 255>
> %ext = shufflevector <2 x i32> %a, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 undef>
> ret <3 x i32> %ext
> }
>
> -; FIXME: Increasing length of vector ops must be safe from illegal undef propagation.
> +; Increasing length of vector ops must be safe from illegal undef propagation.
>
> define <3 x i32> @div_wider(i32 %y, i32 %z) {
> -; CHECK-LABEL: @div(
> -; CHECK-NEXT: ret <3 x i32> undef
> +; CHECK-LABEL: @div_wider(
> +; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x i32> undef, i32 [[Y:%.*]], i32 0
> +; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x i32> [[I0]], i32 [[Z:%.*]], i32 1
> +; CHECK-NEXT: [[A:%.*]] = sdiv <2 x i32> [[I1]], <i32 255, i32 255>
> +; CHECK-NEXT: [[EXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 undef>
> +; CHECK-NEXT: ret <3 x i32> [[EXT]]
> ;
> %i0 = insertelement <2 x i32> undef, i32 %y, i32 0
> %i1 = insertelement <2 x i32> %i0, i32 %z, i32 1
> Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
> ===================================================================
> --- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
> +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
> @@ -1464,7 +1464,8 @@
> if (isRHSID) return replaceInstUsesWith(SVI, RHS);
> }
>
> - if (isa<UndefValue>(RHS) && CanEvaluateShuffled(LHS, Mask)) {
> + if (isa<UndefValue>(RHS) && !SVI.increasesLength() &&
> + CanEvaluateShuffled(LHS, Mask)) {
> Value *V = EvaluateInDifferentElementOrder(LHS, Mask);
> return replaceInstUsesWith(SVI, V);
> }
> Index: llvm/trunk/include/llvm/IR/Instructions.h
> ===================================================================
> --- llvm/trunk/include/llvm/IR/Instructions.h
> +++ llvm/trunk/include/llvm/IR/Instructions.h
> @@ -2457,13 +2457,23 @@
>
> /// Return true if this shuffle returns a vector with a different number of
> /// elements than its source vectors.
> - /// Example: shufflevector <4 x n> A, <4 x n> B, <1,2>
> + /// Examples: shufflevector <4 x n> A, <4 x n> B, <1,2,3>
> + /// shufflevector <4 x n> A, <4 x n> B, <1,2,3,4,5>
> bool changesLength() const {
> unsigned NumSourceElts = Op<0>()->getType()->getVectorNumElements();
> unsigned NumMaskElts = getMask()->getType()->getVectorNumElements();
> return NumSourceElts != NumMaskElts;
> }
>
> + /// Return true if this shuffle returns a vector with a greater number of
> + /// elements than its source vectors.
> + /// Example: shufflevector <2 x n> A, <2 x n> B, <1,2,3>
> + bool increasesLength() const {
> + unsigned NumSourceElts = Op<0>()->getType()->getVectorNumElements();
> + unsigned NumMaskElts = getMask()->getType()->getVectorNumElements();
> + return NumSourceElts < NumMaskElts;
> + }
> +
> /// Return true if this shuffle mask chooses elements from exactly one source
> /// vector.
> /// Example: <7,5,undef,7>
Hi,
This change introduces failures when loading/storing char3 vectors in
OpenCL (amdgcn backend):
llc: /home/orome/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp:4159:
llvm::SDValue llvm::SelectionDAG::getNode(unsigned int, const
llvm::SDLoc&, llvm::EVT, llvm::SDValue, llvm::SDNodeFlags): Assertion
`VT.getSizeInBits() == Operand.getValueSizeInBits() && "Cannot BITCAST
between types of different sizes!"' failed.
Stack dump:
0. Program arguments: /home/orome/.local/bin/llc -march=amdgcn
1. Running pass 'CallGraph Pass Manager' on module '<stdin>'.
2. Running pass 'AMDGPU DAG->DAG Pattern Instruction Selection'
on function '@vload3_constant'
The patch changes how the char3 vector is constructed. Instead of starting
with a <4 x i8> (edited):
%3 = insertelement <4 x i8> undef, i8 %2, i32 0
%5 = insertelement <4 x i8> %3, i8 %4, i32 1
%7 = insertelement <4 x i8> %5, i8 %6, i32 2
store <4 x i8> %7, <4 x i8> addrspace(1)* %storetmp, align 4, !tbaa !11
it creates a <3 x i8>, which is then shuffled into a wider vector and stored through a pointer bitcast to <4 x i8>:
%vecinit.i = insertelement <3 x i8> undef, i8 %2, i32 0
%vecinit1.i = insertelement <3 x i8> %vecinit.i, i8 %3, i32 1
%vecinit4.i = insertelement <3 x i8> %vecinit1.i, i8 %4, i32 2
%extractVec = shufflevector <3 x i8> %vecinit4.i, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
^^^ the shuffle result should probably be <4 x i8> ????
%storetmp = bitcast <3 x i8> addrspace(1)* %out to <4 x i8> addrspace(1)*
^^^ This one fails
store <4 x i8> %extractVec, <4 x i8> addrspace(1)* %storetmp, align 4, !tbaa !11
I've attached the original .cl file (it also needs libclc to
compile) and both the good and bad processed .ll files.
Just running
'llc -march=amdgcn < bad.link-0.ll'
reproduces the above assertion failure.
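For reference, here is a hand-reduced sketch that isolates the pattern
described above (insertelement into <3 x i8>, a widening shufflevector,
and a store through a pointer bitcast). This is a hypothetical reduction
distilled from vload3_constant and has not been verified to trip the same
assertion; the attached bad .ll file remains the actual reproducer.

define amdgpu_kernel void @store_char3_sketch(<3 x i8> addrspace(1)* nocapture %out, i8 %x, i8 %y, i8 %z) {
entry:
  ; build the <3 x i8> element by element, as clang does for a char3 value
  %v0 = insertelement <3 x i8> undef, i8 %x, i32 0
  %v1 = insertelement <3 x i8> %v0, i8 %y, i32 1
  %v2 = insertelement <3 x i8> %v1, i8 %z, i32 2
  ; widening shuffle: the result type is <4 x i8> because the mask has 4 elements
  %ext = shufflevector <3 x i8> %v2, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  ; the bitcast is on the pointer; the stored value is the <4 x i8> shuffle result
  %cast = bitcast <3 x i8> addrspace(1)* %out to <4 x i8> addrspace(1)*
  store <4 x i8> %ext, <4 x i8> addrspace(1)* %cast, align 4
  ret void
}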
The problem appears only with char3, not with other 3-element vector
types or with vectors of other sizes.
thanks,
Jan
--
Jan Vesely <jan.vesely at rutgers.edu>
-------------- next part --------------
; ModuleID = 'link'
source_filename = "link"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
target triple = "amdgcn-mesa-mesa3d"
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload2_constant(i8 addrspace(4)* nocapture readonly %in, <2 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to <2 x i8> addrspace(4)*
%1 = load <2 x i8>, <2 x i8> addrspace(4)* %0, align 1, !tbaa !11
store <2 x i8> %1, <2 x i8> addrspace(1)* %out, align 2, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%2 = bitcast i8 addrspace(4)* %add.ptr to <2 x i8> addrspace(4)*
%3 = load <2 x i8>, <2 x i8> addrspace(4)* %2, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %out, i64 1
store <2 x i8> %3, <2 x i8> addrspace(1)* %arrayidx2, align 2, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload2_constant_offset(i8 addrspace(4)* nocapture readonly %in, <2 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 2
%0 = bitcast i8 addrspace(4)* %arrayidx.i to <2 x i8> addrspace(4)*
%1 = load <2 x i8>, <2 x i8> addrspace(4)* %0, align 1, !tbaa !11
store <2 x i8> %1, <2 x i8> addrspace(1)* %out, align 2, !tbaa !11
%arrayidx.i5 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 3
%2 = bitcast i8 addrspace(4)* %arrayidx.i5 to <2 x i8> addrspace(4)*
%3 = load <2 x i8>, <2 x i8> addrspace(4)* %2, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %out, i64 1
store <2 x i8> %3, <2 x i8> addrspace(1)* %arrayidx2, align 2, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload3_constant(i8 addrspace(4)* nocapture readonly %in, <3 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !14 !kernel_arg_base_type !15 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to <2 x i8> addrspace(4)*
%1 = load <2 x i8>, <2 x i8> addrspace(4)* %0, align 1, !tbaa !11
%2 = extractelement <2 x i8> %1, i64 0
%vecinit.i = insertelement <3 x i8> undef, i8 %2, i32 0
%3 = extractelement <2 x i8> %1, i64 1
%vecinit1.i = insertelement <3 x i8> %vecinit.i, i8 %3, i32 1
%arrayidx3.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 2
%4 = load i8, i8 addrspace(4)* %arrayidx3.i, align 1, !tbaa !11
%vecinit4.i = insertelement <3 x i8> %vecinit1.i, i8 %4, i32 2
%extractVec = shufflevector <3 x i8> %vecinit4.i, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
%storetmp = bitcast <3 x i8> addrspace(1)* %out to <4 x i8> addrspace(1)*
store <4 x i8> %extractVec, <4 x i8> addrspace(1)* %storetmp, align 4, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%5 = bitcast i8 addrspace(4)* %add.ptr to <2 x i8> addrspace(4)*
%6 = load <2 x i8>, <2 x i8> addrspace(4)* %5, align 1, !tbaa !11
%7 = extractelement <2 x i8> %6, i64 0
%vecinit.i7 = insertelement <3 x i8> undef, i8 %7, i32 0
%8 = extractelement <2 x i8> %6, i64 1
%vecinit1.i8 = insertelement <3 x i8> %vecinit.i7, i8 %8, i32 1
%arrayidx3.i9 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 3
%9 = load i8, i8 addrspace(4)* %arrayidx3.i9, align 1, !tbaa !11
%vecinit4.i10 = insertelement <3 x i8> %vecinit1.i8, i8 %9, i32 2
%arrayidx2 = getelementptr inbounds <3 x i8>, <3 x i8> addrspace(1)* %out, i64 1
%extractVec3 = shufflevector <3 x i8> %vecinit4.i10, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
%storetmp4 = bitcast <3 x i8> addrspace(1)* %arrayidx2 to <4 x i8> addrspace(1)*
store <4 x i8> %extractVec3, <4 x i8> addrspace(1)* %storetmp4, align 4, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload3_constant_offset(i8 addrspace(4)* nocapture readonly %in, <3 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !14 !kernel_arg_base_type !15 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 3
%0 = bitcast i8 addrspace(4)* %arrayidx.i to <2 x i8> addrspace(4)*
%1 = load <2 x i8>, <2 x i8> addrspace(4)* %0, align 1, !tbaa !11
%2 = extractelement <2 x i8> %1, i64 0
%vecinit.i = insertelement <3 x i8> undef, i8 %2, i32 0
%3 = extractelement <2 x i8> %1, i64 1
%vecinit1.i = insertelement <3 x i8> %vecinit.i, i8 %3, i32 1
%arrayidx3.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 5
%4 = load i8, i8 addrspace(4)* %arrayidx3.i, align 1, !tbaa !11
%vecinit4.i = insertelement <3 x i8> %vecinit1.i, i8 %4, i32 2
%extractVec = shufflevector <3 x i8> %vecinit4.i, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
%storetmp = bitcast <3 x i8> addrspace(1)* %out to <4 x i8> addrspace(1)*
store <4 x i8> %extractVec, <4 x i8> addrspace(1)* %storetmp, align 4, !tbaa !11
%arrayidx.i7 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 4
%5 = bitcast i8 addrspace(4)* %arrayidx.i7 to <2 x i8> addrspace(4)*
%6 = load <2 x i8>, <2 x i8> addrspace(4)* %5, align 1, !tbaa !11
%7 = extractelement <2 x i8> %6, i64 0
%vecinit.i8 = insertelement <3 x i8> undef, i8 %7, i32 0
%8 = extractelement <2 x i8> %6, i64 1
%vecinit1.i9 = insertelement <3 x i8> %vecinit.i8, i8 %8, i32 1
%arrayidx3.i10 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 6
%9 = load i8, i8 addrspace(4)* %arrayidx3.i10, align 1, !tbaa !11
%vecinit4.i11 = insertelement <3 x i8> %vecinit1.i9, i8 %9, i32 2
%arrayidx2 = getelementptr inbounds <3 x i8>, <3 x i8> addrspace(1)* %out, i64 1
%extractVec3 = shufflevector <3 x i8> %vecinit4.i11, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
%storetmp4 = bitcast <3 x i8> addrspace(1)* %arrayidx2 to <4 x i8> addrspace(1)*
store <4 x i8> %extractVec3, <4 x i8> addrspace(1)* %storetmp4, align 4, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload4_constant(i8 addrspace(4)* nocapture readonly %in, <4 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !16 !kernel_arg_base_type !17 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to i32 addrspace(4)*
%1 = load i32, i32 addrspace(4)* %0, align 1, !tbaa !11
%2 = bitcast <4 x i8> addrspace(1)* %out to i32 addrspace(1)*
store i32 %1, i32 addrspace(1)* %2, align 4, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%3 = bitcast i8 addrspace(4)* %add.ptr to i32 addrspace(4)*
%4 = load i32, i32 addrspace(4)* %3, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %out, i64 1
%5 = bitcast <4 x i8> addrspace(1)* %arrayidx2 to i32 addrspace(1)*
store i32 %4, i32 addrspace(1)* %5, align 4, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload4_constant_offset(i8 addrspace(4)* nocapture readonly %in, <4 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !16 !kernel_arg_base_type !17 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 4
%0 = bitcast i8 addrspace(4)* %arrayidx.i to i32 addrspace(4)*
%1 = load i32, i32 addrspace(4)* %0, align 1, !tbaa !11
%2 = bitcast <4 x i8> addrspace(1)* %out to i32 addrspace(1)*
store i32 %1, i32 addrspace(1)* %2, align 4, !tbaa !11
%arrayidx.i5 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 5
%3 = bitcast i8 addrspace(4)* %arrayidx.i5 to i32 addrspace(4)*
%4 = load i32, i32 addrspace(4)* %3, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %out, i64 1
%5 = bitcast <4 x i8> addrspace(1)* %arrayidx2 to i32 addrspace(1)*
store i32 %4, i32 addrspace(1)* %5, align 4, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload8_constant(i8 addrspace(4)* nocapture readonly %in, <8 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !18 !kernel_arg_base_type !19 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to i64 addrspace(4)*
%1 = load i64, i64 addrspace(4)* %0, align 1, !tbaa !11
%2 = bitcast <8 x i8> addrspace(1)* %out to i64 addrspace(1)*
store i64 %1, i64 addrspace(1)* %2, align 8, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%3 = bitcast i8 addrspace(4)* %add.ptr to i64 addrspace(4)*
%4 = load i64, i64 addrspace(4)* %3, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %out, i64 1
%5 = bitcast <8 x i8> addrspace(1)* %arrayidx2 to i64 addrspace(1)*
store i64 %4, i64 addrspace(1)* %5, align 8, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload8_constant_offset(i8 addrspace(4)* nocapture readonly %in, <8 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !18 !kernel_arg_base_type !19 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 8
%0 = bitcast i8 addrspace(4)* %arrayidx.i to i64 addrspace(4)*
%1 = load i64, i64 addrspace(4)* %0, align 1, !tbaa !11
%2 = bitcast <8 x i8> addrspace(1)* %out to i64 addrspace(1)*
store i64 %1, i64 addrspace(1)* %2, align 8, !tbaa !11
%arrayidx.i5 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 9
%3 = bitcast i8 addrspace(4)* %arrayidx.i5 to i64 addrspace(4)*
%4 = load i64, i64 addrspace(4)* %3, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %out, i64 1
%5 = bitcast <8 x i8> addrspace(1)* %arrayidx2 to i64 addrspace(1)*
store i64 %4, i64 addrspace(1)* %5, align 8, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload16_constant(i8 addrspace(4)* nocapture readonly %in, <16 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !20 !kernel_arg_base_type !21 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to <16 x i8> addrspace(4)*
%1 = load <16 x i8>, <16 x i8> addrspace(4)* %0, align 1, !tbaa !11
store <16 x i8> %1, <16 x i8> addrspace(1)* %out, align 16, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%2 = bitcast i8 addrspace(4)* %add.ptr to <16 x i8> addrspace(4)*
%3 = load <16 x i8>, <16 x i8> addrspace(4)* %2, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8> addrspace(1)* %out, i64 1
store <16 x i8> %3, <16 x i8> addrspace(1)* %arrayidx2, align 16, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload16_constant_offset(i8 addrspace(4)* nocapture readonly %in, <16 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !20 !kernel_arg_base_type !21 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 16
%0 = bitcast i8 addrspace(4)* %arrayidx.i to <16 x i8> addrspace(4)*
%1 = load <16 x i8>, <16 x i8> addrspace(4)* %0, align 1, !tbaa !11
store <16 x i8> %1, <16 x i8> addrspace(1)* %out, align 16, !tbaa !11
%arrayidx.i5 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 17
%2 = bitcast i8 addrspace(4)* %arrayidx.i5 to <16 x i8> addrspace(4)*
%3 = load <16 x i8>, <16 x i8> addrspace(4)* %2, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8> addrspace(1)* %out, i64 1
store <16 x i8> %3, <16 x i8> addrspace(1)* %arrayidx2, align 16, !tbaa !11
ret void
}
attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx902" "target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
!opencl.ocl.version = !{!0}
!llvm.ident = !{!1, !2, !3}
!llvm.module.flags = !{!4, !5}
!0 = !{i32 1, i32 1}
!1 = !{!"clang version 8.0.0 (https://git.llvm.org/git/clang.git 40ec5c4d12536fe608d41b4e7b8b6791c60d5e36) (https://git.llvm.org/git/llvm.git 331a5ec71329eb0a6d46104ff00af89598f4292a)"}
!2 = !{!"clang version 8.0.0 (https://git.llvm.org/git/clang.git 0a8a76da18b28bea9500c5f4539fe9c65075b4ac) (https://git.llvm.org/git/llvm.git a67d12007b6f06d9a47d21f545e199789b1652a2)"}
!3 = !{!"clang version 7.0.0 (https://git.llvm.org/git/clang.git 261aad80c8b0592f57fbda1244a44295ad2bfeb3) (https://git.llvm.org/git/llvm.git 639a95afd42e0a525108992263b94d0def6fdca7)"}
!4 = !{i32 1, !"wchar_size", i32 4}
!5 = !{i32 7, !"PIC Level", i32 1}
!6 = !{i32 2, i32 1}
!7 = !{!"none", !"none"}
!8 = !{!"char*", !"char2*"}
!9 = !{!"char*", !"char __attribute__((ext_vector_type(2)))*"}
!10 = !{!"const", !""}
!11 = !{!12, !12, i64 0}
!12 = !{!"omnipotent char", !13, i64 0}
!13 = !{!"Simple C/C++ TBAA"}
!14 = !{!"char*", !"char3*"}
!15 = !{!"char*", !"char __attribute__((ext_vector_type(3)))*"}
!16 = !{!"char*", !"char4*"}
!17 = !{!"char*", !"char __attribute__((ext_vector_type(4)))*"}
!18 = !{!"char*", !"char8*"}
!19 = !{!"char*", !"char __attribute__((ext_vector_type(8)))*"}
!20 = !{!"char*", !"char16*"}
!21 = !{!"char*", !"char __attribute__((ext_vector_type(16)))*"}
-------------- next part --------------
; ModuleID = 'link'
source_filename = "link"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
target triple = "amdgcn-mesa-mesa3d"
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload2_constant(i8 addrspace(4)* nocapture readonly %in, <2 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to <2 x i8> addrspace(4)*
%1 = load <2 x i8>, <2 x i8> addrspace(4)* %0, align 1, !tbaa !11
store <2 x i8> %1, <2 x i8> addrspace(1)* %out, align 2, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%2 = bitcast i8 addrspace(4)* %add.ptr to <2 x i8> addrspace(4)*
%3 = load <2 x i8>, <2 x i8> addrspace(4)* %2, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %out, i64 1
store <2 x i8> %3, <2 x i8> addrspace(1)* %arrayidx2, align 2, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload2_constant_offset(i8 addrspace(4)* nocapture readonly %in, <2 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !8 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 2
%0 = bitcast i8 addrspace(4)* %arrayidx.i to <2 x i8> addrspace(4)*
%1 = load <2 x i8>, <2 x i8> addrspace(4)* %0, align 1, !tbaa !11
store <2 x i8> %1, <2 x i8> addrspace(1)* %out, align 2, !tbaa !11
%arrayidx.i5 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 3
%2 = bitcast i8 addrspace(4)* %arrayidx.i5 to <2 x i8> addrspace(4)*
%3 = load <2 x i8>, <2 x i8> addrspace(4)* %2, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %out, i64 1
store <2 x i8> %3, <2 x i8> addrspace(1)* %arrayidx2, align 2, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload3_constant(i8 addrspace(4)* nocapture readonly %in, <3 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !14 !kernel_arg_base_type !15 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to <2 x i8> addrspace(4)*
%1 = load <2 x i8>, <2 x i8> addrspace(4)* %0, align 1, !tbaa !11
%2 = extractelement <2 x i8> %1, i64 0
%3 = insertelement <4 x i8> undef, i8 %2, i32 0
%4 = extractelement <2 x i8> %1, i64 1
%5 = insertelement <4 x i8> %3, i8 %4, i32 1
%arrayidx3.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 2
%6 = load i8, i8 addrspace(4)* %arrayidx3.i, align 1, !tbaa !11
%7 = insertelement <4 x i8> %5, i8 %6, i32 2
%storetmp = bitcast <3 x i8> addrspace(1)* %out to <4 x i8> addrspace(1)*
store <4 x i8> %7, <4 x i8> addrspace(1)* %storetmp, align 4, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%8 = bitcast i8 addrspace(4)* %add.ptr to <2 x i8> addrspace(4)*
%9 = load <2 x i8>, <2 x i8> addrspace(4)* %8, align 1, !tbaa !11
%10 = extractelement <2 x i8> %9, i64 0
%11 = insertelement <4 x i8> undef, i8 %10, i32 0
%12 = extractelement <2 x i8> %9, i64 1
%13 = insertelement <4 x i8> %11, i8 %12, i32 1
%arrayidx3.i9 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 3
%14 = load i8, i8 addrspace(4)* %arrayidx3.i9, align 1, !tbaa !11
%15 = insertelement <4 x i8> %13, i8 %14, i32 2
%arrayidx2 = getelementptr inbounds <3 x i8>, <3 x i8> addrspace(1)* %out, i64 1
%storetmp4 = bitcast <3 x i8> addrspace(1)* %arrayidx2 to <4 x i8> addrspace(1)*
store <4 x i8> %15, <4 x i8> addrspace(1)* %storetmp4, align 4, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload3_constant_offset(i8 addrspace(4)* nocapture readonly %in, <3 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !14 !kernel_arg_base_type !15 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 3
%0 = bitcast i8 addrspace(4)* %arrayidx.i to <2 x i8> addrspace(4)*
%1 = load <2 x i8>, <2 x i8> addrspace(4)* %0, align 1, !tbaa !11
%2 = extractelement <2 x i8> %1, i64 0
%3 = insertelement <4 x i8> undef, i8 %2, i32 0
%4 = extractelement <2 x i8> %1, i64 1
%5 = insertelement <4 x i8> %3, i8 %4, i32 1
%arrayidx3.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 5
%6 = load i8, i8 addrspace(4)* %arrayidx3.i, align 1, !tbaa !11
%7 = insertelement <4 x i8> %5, i8 %6, i32 2
%storetmp = bitcast <3 x i8> addrspace(1)* %out to <4 x i8> addrspace(1)*
store <4 x i8> %7, <4 x i8> addrspace(1)* %storetmp, align 4, !tbaa !11
%arrayidx.i7 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 4
%8 = bitcast i8 addrspace(4)* %arrayidx.i7 to <2 x i8> addrspace(4)*
%9 = load <2 x i8>, <2 x i8> addrspace(4)* %8, align 1, !tbaa !11
%10 = extractelement <2 x i8> %9, i64 0
%11 = insertelement <4 x i8> undef, i8 %10, i32 0
%12 = extractelement <2 x i8> %9, i64 1
%13 = insertelement <4 x i8> %11, i8 %12, i32 1
%arrayidx3.i10 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 6
%14 = load i8, i8 addrspace(4)* %arrayidx3.i10, align 1, !tbaa !11
%15 = insertelement <4 x i8> %13, i8 %14, i32 2
%arrayidx2 = getelementptr inbounds <3 x i8>, <3 x i8> addrspace(1)* %out, i64 1
%storetmp4 = bitcast <3 x i8> addrspace(1)* %arrayidx2 to <4 x i8> addrspace(1)*
store <4 x i8> %15, <4 x i8> addrspace(1)* %storetmp4, align 4, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload4_constant(i8 addrspace(4)* nocapture readonly %in, <4 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !16 !kernel_arg_base_type !17 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to i32 addrspace(4)*
%1 = load i32, i32 addrspace(4)* %0, align 1, !tbaa !11
%2 = bitcast <4 x i8> addrspace(1)* %out to i32 addrspace(1)*
store i32 %1, i32 addrspace(1)* %2, align 4, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%3 = bitcast i8 addrspace(4)* %add.ptr to i32 addrspace(4)*
%4 = load i32, i32 addrspace(4)* %3, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %out, i64 1
%5 = bitcast <4 x i8> addrspace(1)* %arrayidx2 to i32 addrspace(1)*
store i32 %4, i32 addrspace(1)* %5, align 4, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload4_constant_offset(i8 addrspace(4)* nocapture readonly %in, <4 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !16 !kernel_arg_base_type !17 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 4
%0 = bitcast i8 addrspace(4)* %arrayidx.i to i32 addrspace(4)*
%1 = load i32, i32 addrspace(4)* %0, align 1, !tbaa !11
%2 = bitcast <4 x i8> addrspace(1)* %out to i32 addrspace(1)*
store i32 %1, i32 addrspace(1)* %2, align 4, !tbaa !11
%arrayidx.i5 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 5
%3 = bitcast i8 addrspace(4)* %arrayidx.i5 to i32 addrspace(4)*
%4 = load i32, i32 addrspace(4)* %3, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %out, i64 1
%5 = bitcast <4 x i8> addrspace(1)* %arrayidx2 to i32 addrspace(1)*
store i32 %4, i32 addrspace(1)* %5, align 4, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload8_constant(i8 addrspace(4)* nocapture readonly %in, <8 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !18 !kernel_arg_base_type !19 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to i64 addrspace(4)*
%1 = load i64, i64 addrspace(4)* %0, align 1, !tbaa !11
%2 = bitcast <8 x i8> addrspace(1)* %out to i64 addrspace(1)*
store i64 %1, i64 addrspace(1)* %2, align 8, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%3 = bitcast i8 addrspace(4)* %add.ptr to i64 addrspace(4)*
%4 = load i64, i64 addrspace(4)* %3, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %out, i64 1
%5 = bitcast <8 x i8> addrspace(1)* %arrayidx2 to i64 addrspace(1)*
store i64 %4, i64 addrspace(1)* %5, align 8, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload8_constant_offset(i8 addrspace(4)* nocapture readonly %in, <8 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !18 !kernel_arg_base_type !19 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 8
%0 = bitcast i8 addrspace(4)* %arrayidx.i to i64 addrspace(4)*
%1 = load i64, i64 addrspace(4)* %0, align 1, !tbaa !11
%2 = bitcast <8 x i8> addrspace(1)* %out to i64 addrspace(1)*
store i64 %1, i64 addrspace(1)* %2, align 8, !tbaa !11
%arrayidx.i5 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 9
%3 = bitcast i8 addrspace(4)* %arrayidx.i5 to i64 addrspace(4)*
%4 = load i64, i64 addrspace(4)* %3, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %out, i64 1
%5 = bitcast <8 x i8> addrspace(1)* %arrayidx2 to i64 addrspace(1)*
store i64 %4, i64 addrspace(1)* %5, align 8, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload16_constant(i8 addrspace(4)* nocapture readonly %in, <16 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !20 !kernel_arg_base_type !21 !kernel_arg_type_qual !10 {
entry:
%0 = bitcast i8 addrspace(4)* %in to <16 x i8> addrspace(4)*
%1 = load <16 x i8>, <16 x i8> addrspace(4)* %0, align 1, !tbaa !11
store <16 x i8> %1, <16 x i8> addrspace(1)* %out, align 16, !tbaa !11
%add.ptr = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 1
%2 = bitcast i8 addrspace(4)* %add.ptr to <16 x i8> addrspace(4)*
%3 = load <16 x i8>, <16 x i8> addrspace(4)* %2, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8> addrspace(1)* %out, i64 1
store <16 x i8> %3, <16 x i8> addrspace(1)* %arrayidx2, align 16, !tbaa !11
ret void
}
; Function Attrs: norecurse nounwind
define amdgpu_kernel void @vload16_constant_offset(i8 addrspace(4)* nocapture readonly %in, <16 x i8> addrspace(1)* nocapture %out) local_unnamed_addr #0 !kernel_arg_addr_space !6 !kernel_arg_access_qual !7 !kernel_arg_type !20 !kernel_arg_base_type !21 !kernel_arg_type_qual !10 {
entry:
%arrayidx.i = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 16
%0 = bitcast i8 addrspace(4)* %arrayidx.i to <16 x i8> addrspace(4)*
%1 = load <16 x i8>, <16 x i8> addrspace(4)* %0, align 1, !tbaa !11
store <16 x i8> %1, <16 x i8> addrspace(1)* %out, align 16, !tbaa !11
%arrayidx.i5 = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 17
%2 = bitcast i8 addrspace(4)* %arrayidx.i5 to <16 x i8> addrspace(4)*
%3 = load <16 x i8>, <16 x i8> addrspace(4)* %2, align 1, !tbaa !11
%arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8> addrspace(1)* %out, i64 1
store <16 x i8> %3, <16 x i8> addrspace(1)* %arrayidx2, align 16, !tbaa !11
ret void
}
attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx902" "target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
!opencl.ocl.version = !{!0}
!llvm.ident = !{!1, !2, !3}
!llvm.module.flags = !{!4, !5}
!0 = !{i32 1, i32 1}
!1 = !{!"clang version 8.0.0 (https://git.llvm.org/git/clang.git 40ec5c4d12536fe608d41b4e7b8b6791c60d5e36) (https://git.llvm.org/git/llvm.git 67529047631348866b13473e3213659a1a1906c9)"}
!2 = !{!"clang version 8.0.0 (https://git.llvm.org/git/clang.git 0a8a76da18b28bea9500c5f4539fe9c65075b4ac) (https://git.llvm.org/git/llvm.git a67d12007b6f06d9a47d21f545e199789b1652a2)"}
!3 = !{!"clang version 7.0.0 (https://git.llvm.org/git/clang.git 261aad80c8b0592f57fbda1244a44295ad2bfeb3) (https://git.llvm.org/git/llvm.git 639a95afd42e0a525108992263b94d0def6fdca7)"}
!4 = !{i32 1, !"wchar_size", i32 4}
!5 = !{i32 7, !"PIC Level", i32 1}
!6 = !{i32 2, i32 1}
!7 = !{!"none", !"none"}
!8 = !{!"char*", !"char2*"}
!9 = !{!"char*", !"char __attribute__((ext_vector_type(2)))*"}
!10 = !{!"const", !""}
!11 = !{!12, !12, i64 0}
!12 = !{!"omnipotent char", !13, i64 0}
!13 = !{!"Simple C/C++ TBAA"}
!14 = !{!"char*", !"char3*"}
!15 = !{!"char*", !"char __attribute__((ext_vector_type(3)))*"}
!16 = !{!"char*", !"char4*"}
!17 = !{!"char*", !"char __attribute__((ext_vector_type(4)))*"}
!18 = !{!"char*", !"char8*"}
!19 = !{!"char*", !"char __attribute__((ext_vector_type(8)))*"}
!20 = !{!"char*", !"char16*"}
!21 = !{!"char*", !"char __attribute__((ext_vector_type(16)))*"}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: vload-char-constant.cl
Type: text/x-opencl-src
Size: 3885 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180929/8f0605db/attachment-0001.bin>