[llvm] r323530 - [SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 26 17:51:34 PST 2018
Hi Sanjoy,
Thanks for the report. To be safe, I'll revert it in a few minutes and investigate on Monday.
Best regards,
Alexey Bataev
> 26 янв. 2018 г., в 20:25, Sanjoy Das <sanjoy at playingwithpointers.com> написал(а):
>
> Hi,
>
> I think this patch is buggy. If I run the following IR
>
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-grtev4-linux-gnu"
>
> ; Function Attrs: norecurse nounwind
> define void @zot(i8* nocapture align 8 dereferenceable(24) %arg, i8*
> noalias nocapture readnone %arg1, i8** noalias nocapture readonly
> %arg2, i8** noalias nocapture readnone %arg3, i64* noalias nocapture
> readnone %arg4) local_unnamed_addr #0 {
> bb:
> %tmp = bitcast i8** %arg2 to [2 x float]**
> %tmp5 = load [2 x float]*, [2 x float]** %tmp, align 8,
> !invariant.load !0, !dereferenceable !1, !align !1
> %tmp6 = getelementptr inbounds i8*, i8** %arg2, i64 1
> %tmp7 = bitcast i8** %tmp6 to [2 x [3 x float]]**
> %tmp8 = load [2 x [3 x float]]*, [2 x [3 x float]]** %tmp7, align 8,
> !invariant.load !0, !dereferenceable !2, !align !1
> %tmp9 = getelementptr inbounds i8*, i8** %arg2, i64 2
> %tmp10 = bitcast i8** %tmp9 to [2 x float]**
> %tmp11 = load [2 x float]*, [2 x float]** %tmp10, align 8,
> !invariant.load !0, !dereferenceable !1, !align !1
> %tmp12 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 0
> %tmp13 = load float, float* %tmp12, align 8, !alias.scope !3, !noalias !6
> %tmp14 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 0
> %tmp15 = load float, float* %tmp14, align 8, !alias.scope !8, !noalias !6
> %tmp16 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 0
> %tmp17 = load float, float* %tmp16, align 8, !invariant.load !0, !noalias !6
> %tmp18 = fadd fast float %tmp17, %tmp13
> %tmp19 = fadd fast float %tmp18, %tmp15
> %tmp20 = bitcast i8* %arg to float*
> store float %tmp19, float* %tmp20, align 8, !alias.scope !6
> %tmp21 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 1
> %tmp22 = load float, float* %tmp21, align 4, !invariant.load !0, !noalias !6
> %tmp23 = fadd fast float %tmp22, %tmp13
> %tmp24 = fadd fast float %tmp23, %tmp15
> %tmp25 = getelementptr inbounds i8, i8* %arg, i64 4
> %tmp26 = bitcast i8* %tmp25 to float*
> store float %tmp24, float* %tmp26, align 4, !alias.scope !6
> %tmp27 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 2
> %tmp28 = load float, float* %tmp27, align 8, !invariant.load !0, !noalias !6
> %tmp29 = fadd fast float %tmp28, %tmp13
> %tmp30 = fadd fast float %tmp29, %tmp15
> %tmp31 = getelementptr inbounds i8, i8* %arg, i64 8
> %tmp32 = bitcast i8* %tmp31 to float*
> store float %tmp30, float* %tmp32, align 8, !alias.scope !6
> %tmp33 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 1
> %tmp34 = load float, float* %tmp33, align 4, !alias.scope !3, !noalias !6
> %tmp35 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 1
> %tmp36 = load float, float* %tmp35, align 4, !alias.scope !8, !noalias !6
> %tmp37 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 0
> %tmp38 = load float, float* %tmp37, align 4, !invariant.load !0, !noalias !6
> %tmp39 = fadd fast float %tmp38, %tmp34
> %tmp40 = fadd fast float %tmp39, %tmp36
> %tmp41 = getelementptr inbounds i8, i8* %arg, i64 12
> %tmp42 = bitcast i8* %tmp41 to float*
> store float %tmp40, float* %tmp42, align 4, !alias.scope !6
> %tmp43 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 1
> %tmp44 = load float, float* %tmp43, align 4, !invariant.load !0, !noalias !6
> %tmp45 = fadd fast float %tmp44, %tmp34
> %tmp46 = fadd fast float %tmp45, %tmp36
> %tmp47 = getelementptr inbounds i8, i8* %arg, i64 16
> %tmp48 = bitcast i8* %tmp47 to float*
> store float %tmp46, float* %tmp48, align 8, !alias.scope !6
> %tmp49 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 2
> %tmp50 = load float, float* %tmp49, align 4, !invariant.load !0, !noalias !6
> %tmp51 = fadd fast float %tmp50, %tmp34
> %tmp52 = fadd fast float %tmp51, %tmp36
> %tmp53 = getelementptr inbounds i8, i8* %arg, i64 20
> %tmp54 = bitcast i8* %tmp53 to float*
> store float %tmp52, float* %tmp54, align 4, !alias.scope !6
> ret void
> }
>
> attributes #0 = { norecurse nounwind "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="true" "no-nans-fp-math"="true"
> "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
>
> !0 = !{}
> !1 = !{i64 8}
> !2 = !{i64 24}
> !3 = !{!4}
> !4 = !{!"buffer: {index:3, offset:0, size:8}", !5}
> !5 = !{!"XLA global AA domain"}
> !6 = !{!7}
> !7 = !{!"buffer: {index:0, offset:0, size:24}", !5}
> !8 = !{!9}
> !9 = !{!"buffer: {index:2, offset:0, size:8}", !5}
>
>
> through opt -slp-vectorizer -scoped-noalias -mcpu=haswell
>
> I get
>
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-grtev4-linux-gnu"
>
> ; Function Attrs: norecurse nounwind
> define void @zot(i8* nocapture align 8 dereferenceable(24) %arg, i8*
> noalias nocapture readnone %arg1, i8** noalias nocapture readonly
> %arg2, i8** noalias nocapture readnone %arg3, i64* noalias nocapture
> readnone %arg4) local_unnamed_addr #0 {
> bb:
> %tmp = bitcast i8** %arg2 to [2 x float]**
> %tmp5 = load [2 x float]*, [2 x float]** %tmp, align 8,
> !invariant.load !0, !dereferenceable !1, !align !1
> %tmp6 = getelementptr inbounds i8*, i8** %arg2, i64 1
> %tmp7 = bitcast i8** %tmp6 to [2 x [3 x float]]**
> %tmp8 = load [2 x [3 x float]]*, [2 x [3 x float]]** %tmp7, align 8,
> !invariant.load !0, !dereferenceable !2, !align !1
> %tmp9 = getelementptr inbounds i8*, i8** %arg2, i64 2
> %tmp10 = bitcast i8** %tmp9 to [2 x float]**
> %tmp11 = load [2 x float]*, [2 x float]** %tmp10, align 8,
> !invariant.load !0, !dereferenceable !1, !align !1
> %tmp12 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 0
> %tmp14 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 0
> %tmp16 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 0
> %tmp20 = bitcast i8* %arg to float*
> %tmp21 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 1
> %tmp25 = getelementptr inbounds i8, i8* %arg, i64 4
> %tmp26 = bitcast i8* %tmp25 to float*
> %tmp27 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 2
> %tmp31 = getelementptr inbounds i8, i8* %arg, i64 8
> %tmp32 = bitcast i8* %tmp31 to float*
> %tmp33 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 1
> %0 = bitcast float* %tmp12 to <2 x float>*
> %1 = load <2 x float>, <2 x float>* %0, align 8, !alias.scope !3, !noalias !6
> %shuffle = shufflevector <2 x float> %1, <2 x float> undef, <4 x
> i32> <i32 0, i32 0, i32 0, i32 1>
> %tmp35 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 1
> %2 = bitcast float* %tmp14 to <2 x float>*
> %3 = load <2 x float>, <2 x float>* %2, align 8, !alias.scope !8, !noalias !6
> %shuffle1 = shufflevector <2 x float> %3, <2 x float> undef, <4 x
> i32> <i32 0, i32 0, i32 0, i32 1>
> %tmp37 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 0
> %4 = bitcast float* %tmp16 to <4 x float>*
> %5 = load <4 x float>, <4 x float>* %4, align 8, !invariant.load !0,
> !noalias !6
> %6 = fadd fast <4 x float> %5, %shuffle
> %7 = fadd fast <4 x float> %6, %shuffle1
> %tmp41 = getelementptr inbounds i8, i8* %arg, i64 12
> %tmp42 = bitcast i8* %tmp41 to float*
> %8 = bitcast float* %tmp20 to <4 x float>*
> store <4 x float> %7, <4 x float>* %8, align 8, !alias.scope !6
> %tmp43 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 1
> %tmp44 = load float, float* %tmp43, align 4, !invariant.load !0, !noalias !6
> %9 = extractelement <4 x float> %shuffle, i32 1
> %tmp45 = fadd fast float %tmp44, %9
> %10 = extractelement <4 x float> %shuffle1, i32 1
> %tmp46 = fadd fast float %tmp45, %10
> %tmp47 = getelementptr inbounds i8, i8* %arg, i64 16
> %tmp48 = bitcast i8* %tmp47 to float*
> store float %tmp46, float* %tmp48, align 8, !alias.scope !6
> %tmp49 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 2
> %tmp50 = load float, float* %tmp49, align 4, !invariant.load !0, !noalias !6
> %tmp51 = fadd fast float %tmp50, %9
> %tmp52 = fadd fast float %tmp51, %10
> %tmp53 = getelementptr inbounds i8, i8* %arg, i64 20
> %tmp54 = bitcast i8* %tmp53 to float*
> store float %tmp52, float* %tmp54, align 4, !alias.scope !6
> ret void
> }
>
> attributes #0 = { norecurse nounwind "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="true" "no-nans-fp-math"="true"
> "no-signed-zeros-fp-math"="true" "target-cpu"="haswell"
> "unsafe-fp-math"="true" }
>
> !0 = !{}
> !1 = !{i64 8}
> !2 = !{i64 24}
> !3 = !{!4}
> !4 = !{!"buffer: {index:3, offset:0, size:8}", !5}
> !5 = !{!"XLA global AA domain"}
> !6 = !{!7}
> !7 = !{!"buffer: {index:0, offset:0, size:24}", !5}
> !8 = !{!9}
> !9 = !{!"buffer: {index:2, offset:0, size:8}", !5}
>
>
> I think in the optimized IR the store to %tmp54 is storing an
> incorrect value. In the original IR it was storing %tmp50 + %tmp34 +
> %tmp36 == *%tmp49 + *%tmp33 + *%tmp35. In the new IR the same store
> stores the value %tmp50 + %9 + %10. However %9 is *%tmp12, which is
> %tmp13 in the original IR. Similarly %10 is *%tmp14 which is %tmp15
> in the original IR.
>
> Either the extract element indices here should be 3
>
> %9 = extractelement <4 x float> %shuffle, i32 1
> %10 = extractelement <4 x float> %shuffle1, i32 1
>
> or their operands should be %1 and %3 respectively.
>
> If this is a correct assessment, can you please revert for now?
>
> -- Sanjoy
More information about the llvm-commits
mailing list