[llvm] r323530 - [SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle.
Sanjoy Das via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 26 17:24:57 PST 2018
Hi,
I think this patch is buggy. If I run the following IR
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
; Function Attrs: norecurse nounwind
define void @zot(i8* nocapture align 8 dereferenceable(24) %arg, i8*
noalias nocapture readnone %arg1, i8** noalias nocapture readonly
%arg2, i8** noalias nocapture readnone %arg3, i64* noalias nocapture
readnone %arg4) local_unnamed_addr #0 {
bb:
%tmp = bitcast i8** %arg2 to [2 x float]**
%tmp5 = load [2 x float]*, [2 x float]** %tmp, align 8,
!invariant.load !0, !dereferenceable !1, !align !1
%tmp6 = getelementptr inbounds i8*, i8** %arg2, i64 1
%tmp7 = bitcast i8** %tmp6 to [2 x [3 x float]]**
%tmp8 = load [2 x [3 x float]]*, [2 x [3 x float]]** %tmp7, align 8,
!invariant.load !0, !dereferenceable !2, !align !1
%tmp9 = getelementptr inbounds i8*, i8** %arg2, i64 2
%tmp10 = bitcast i8** %tmp9 to [2 x float]**
%tmp11 = load [2 x float]*, [2 x float]** %tmp10, align 8,
!invariant.load !0, !dereferenceable !1, !align !1
%tmp12 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 0
%tmp13 = load float, float* %tmp12, align 8, !alias.scope !3, !noalias !6
%tmp14 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 0
%tmp15 = load float, float* %tmp14, align 8, !alias.scope !8, !noalias !6
%tmp16 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 0
%tmp17 = load float, float* %tmp16, align 8, !invariant.load !0, !noalias !6
%tmp18 = fadd fast float %tmp17, %tmp13
%tmp19 = fadd fast float %tmp18, %tmp15
%tmp20 = bitcast i8* %arg to float*
store float %tmp19, float* %tmp20, align 8, !alias.scope !6
%tmp21 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 1
%tmp22 = load float, float* %tmp21, align 4, !invariant.load !0, !noalias !6
%tmp23 = fadd fast float %tmp22, %tmp13
%tmp24 = fadd fast float %tmp23, %tmp15
%tmp25 = getelementptr inbounds i8, i8* %arg, i64 4
%tmp26 = bitcast i8* %tmp25 to float*
store float %tmp24, float* %tmp26, align 4, !alias.scope !6
%tmp27 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 2
%tmp28 = load float, float* %tmp27, align 8, !invariant.load !0, !noalias !6
%tmp29 = fadd fast float %tmp28, %tmp13
%tmp30 = fadd fast float %tmp29, %tmp15
%tmp31 = getelementptr inbounds i8, i8* %arg, i64 8
%tmp32 = bitcast i8* %tmp31 to float*
store float %tmp30, float* %tmp32, align 8, !alias.scope !6
%tmp33 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 1
%tmp34 = load float, float* %tmp33, align 4, !alias.scope !3, !noalias !6
%tmp35 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 1
%tmp36 = load float, float* %tmp35, align 4, !alias.scope !8, !noalias !6
%tmp37 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 0
%tmp38 = load float, float* %tmp37, align 4, !invariant.load !0, !noalias !6
%tmp39 = fadd fast float %tmp38, %tmp34
%tmp40 = fadd fast float %tmp39, %tmp36
%tmp41 = getelementptr inbounds i8, i8* %arg, i64 12
%tmp42 = bitcast i8* %tmp41 to float*
store float %tmp40, float* %tmp42, align 4, !alias.scope !6
%tmp43 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 1
%tmp44 = load float, float* %tmp43, align 4, !invariant.load !0, !noalias !6
%tmp45 = fadd fast float %tmp44, %tmp34
%tmp46 = fadd fast float %tmp45, %tmp36
%tmp47 = getelementptr inbounds i8, i8* %arg, i64 16
%tmp48 = bitcast i8* %tmp47 to float*
store float %tmp46, float* %tmp48, align 8, !alias.scope !6
%tmp49 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 2
%tmp50 = load float, float* %tmp49, align 4, !invariant.load !0, !noalias !6
%tmp51 = fadd fast float %tmp50, %tmp34
%tmp52 = fadd fast float %tmp51, %tmp36
%tmp53 = getelementptr inbounds i8, i8* %arg, i64 20
%tmp54 = bitcast i8* %tmp53 to float*
store float %tmp52, float* %tmp54, align 4, !alias.scope !6
ret void
}
attributes #0 = { norecurse nounwind "no-frame-pointer-elim"="false"
"no-infs-fp-math"="true" "no-nans-fp-math"="true"
"no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
!0 = !{}
!1 = !{i64 8}
!2 = !{i64 24}
!3 = !{!4}
!4 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!7}
!7 = !{!"buffer: {index:0, offset:0, size:24}", !5}
!8 = !{!9}
!9 = !{!"buffer: {index:2, offset:0, size:8}", !5}
through opt -slp-vectorizer -scoped-noalias -mcpu=haswell
I get
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
; Function Attrs: norecurse nounwind
define void @zot(i8* nocapture align 8 dereferenceable(24) %arg, i8*
noalias nocapture readnone %arg1, i8** noalias nocapture readonly
%arg2, i8** noalias nocapture readnone %arg3, i64* noalias nocapture
readnone %arg4) local_unnamed_addr #0 {
bb:
%tmp = bitcast i8** %arg2 to [2 x float]**
%tmp5 = load [2 x float]*, [2 x float]** %tmp, align 8,
!invariant.load !0, !dereferenceable !1, !align !1
%tmp6 = getelementptr inbounds i8*, i8** %arg2, i64 1
%tmp7 = bitcast i8** %tmp6 to [2 x [3 x float]]**
%tmp8 = load [2 x [3 x float]]*, [2 x [3 x float]]** %tmp7, align 8,
!invariant.load !0, !dereferenceable !2, !align !1
%tmp9 = getelementptr inbounds i8*, i8** %arg2, i64 2
%tmp10 = bitcast i8** %tmp9 to [2 x float]**
%tmp11 = load [2 x float]*, [2 x float]** %tmp10, align 8,
!invariant.load !0, !dereferenceable !1, !align !1
%tmp12 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 0
%tmp14 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 0
%tmp16 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 0
%tmp20 = bitcast i8* %arg to float*
%tmp21 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 1
%tmp25 = getelementptr inbounds i8, i8* %arg, i64 4
%tmp26 = bitcast i8* %tmp25 to float*
%tmp27 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 2
%tmp31 = getelementptr inbounds i8, i8* %arg, i64 8
%tmp32 = bitcast i8* %tmp31 to float*
%tmp33 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 1
%0 = bitcast float* %tmp12 to <2 x float>*
%1 = load <2 x float>, <2 x float>* %0, align 8, !alias.scope !3, !noalias !6
%shuffle = shufflevector <2 x float> %1, <2 x float> undef, <4 x
i32> <i32 0, i32 0, i32 0, i32 1>
%tmp35 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 1
%2 = bitcast float* %tmp14 to <2 x float>*
%3 = load <2 x float>, <2 x float>* %2, align 8, !alias.scope !8, !noalias !6
%shuffle1 = shufflevector <2 x float> %3, <2 x float> undef, <4 x
i32> <i32 0, i32 0, i32 0, i32 1>
%tmp37 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 0
%4 = bitcast float* %tmp16 to <4 x float>*
%5 = load <4 x float>, <4 x float>* %4, align 8, !invariant.load !0,
!noalias !6
%6 = fadd fast <4 x float> %5, %shuffle
%7 = fadd fast <4 x float> %6, %shuffle1
%tmp41 = getelementptr inbounds i8, i8* %arg, i64 12
%tmp42 = bitcast i8* %tmp41 to float*
%8 = bitcast float* %tmp20 to <4 x float>*
store <4 x float> %7, <4 x float>* %8, align 8, !alias.scope !6
%tmp43 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 1
%tmp44 = load float, float* %tmp43, align 4, !invariant.load !0, !noalias !6
%9 = extractelement <4 x float> %shuffle, i32 1
%tmp45 = fadd fast float %tmp44, %9
%10 = extractelement <4 x float> %shuffle1, i32 1
%tmp46 = fadd fast float %tmp45, %10
%tmp47 = getelementptr inbounds i8, i8* %arg, i64 16
%tmp48 = bitcast i8* %tmp47 to float*
store float %tmp46, float* %tmp48, align 8, !alias.scope !6
%tmp49 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 2
%tmp50 = load float, float* %tmp49, align 4, !invariant.load !0, !noalias !6
%tmp51 = fadd fast float %tmp50, %9
%tmp52 = fadd fast float %tmp51, %10
%tmp53 = getelementptr inbounds i8, i8* %arg, i64 20
%tmp54 = bitcast i8* %tmp53 to float*
store float %tmp52, float* %tmp54, align 4, !alias.scope !6
ret void
}
attributes #0 = { norecurse nounwind "no-frame-pointer-elim"="false"
"no-infs-fp-math"="true" "no-nans-fp-math"="true"
"no-signed-zeros-fp-math"="true" "target-cpu"="haswell"
"unsafe-fp-math"="true" }
!0 = !{}
!1 = !{i64 8}
!2 = !{i64 24}
!3 = !{!4}
!4 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!7}
!7 = !{!"buffer: {index:0, offset:0, size:24}", !5}
!8 = !{!9}
!9 = !{!"buffer: {index:2, offset:0, size:8}", !5}
I think in the optimized IR the store to %tmp54 is storing an
incorrect value. In the original IR it was storing %tmp50 + %tmp34 +
%tmp36 == *%tmp49 + *%tmp33 + *%tmp35. In the new IR the same store
stores the value %tmp50 + %9 + %10. However, %9 is *%tmp12, which is
%tmp13 in the original IR. Similarly, %10 is *%tmp14, which is %tmp15
in the original IR. (The store to %tmp48 is affected in the same way:
it now stores %tmp44 + %9 + %10 instead of %tmp44 + %tmp34 + %tmp36.)
Either the extract element indices here should be 3 (the shuffle mask
<0, 0, 0, 1> places element 1 of the loaded vector in lane 3)
%9 = extractelement <4 x float> %shuffle, i32 1
%10 = extractelement <4 x float> %shuffle1, i32 1
or their operands should be %1 and %3 respectively.
If this is a correct assessment, can you please revert for now?
-- Sanjoy
More information about the llvm-commits
mailing list