[llvm] r323530 - [SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle.

Fri Jan 26 17:51:34 PST 2018

Hi Sanjoy, 
Thanks for the report. I'll revert it in a few minutes, just in case, and investigate on Monday. 

Best regards,
Alexey Bataev

> 26 янв. 2018 г., в 20:25, Sanjoy Das <sanjoy at playingwithpointers.com> написал(а):
> 
> Hi,
> 
> I think this patch is buggy.  If I run the following IR
> 
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-grtev4-linux-gnu"
> 
> ; Function Attrs: norecurse nounwind
> define void @zot(i8* nocapture align 8 dereferenceable(24) %arg, i8*
> noalias nocapture readnone %arg1, i8** noalias nocapture readonly
> %arg2, i8** noalias nocapture readnone %arg3, i64* noalias nocapture
> readnone %arg4) local_unnamed_addr #0 {
> bb:
>  %tmp = bitcast i8** %arg2 to [2 x float]**
>  %tmp5 = load [2 x float]*, [2 x float]** %tmp, align 8,
> !invariant.load !0, !dereferenceable !1, !align !1
>  %tmp6 = getelementptr inbounds i8*, i8** %arg2, i64 1
>  %tmp7 = bitcast i8** %tmp6 to [2 x [3 x float]]**
>  %tmp8 = load [2 x [3 x float]]*, [2 x [3 x float]]** %tmp7, align 8,
> !invariant.load !0, !dereferenceable !2, !align !1
>  %tmp9 = getelementptr inbounds i8*, i8** %arg2, i64 2
>  %tmp10 = bitcast i8** %tmp9 to [2 x float]**
>  %tmp11 = load [2 x float]*, [2 x float]** %tmp10, align 8,
> !invariant.load !0, !dereferenceable !1, !align !1
>  %tmp12 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 0
>  %tmp13 = load float, float* %tmp12, align 8, !alias.scope !3, !noalias !6
>  %tmp14 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 0
>  %tmp15 = load float, float* %tmp14, align 8, !alias.scope !8, !noalias !6
>  %tmp16 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 0
>  %tmp17 = load float, float* %tmp16, align 8, !invariant.load !0, !noalias !6
>  %tmp18 = fadd fast float %tmp17, %tmp13
>  %tmp19 = fadd fast float %tmp18, %tmp15
>  %tmp20 = bitcast i8* %arg to float*
>  store float %tmp19, float* %tmp20, align 8, !alias.scope !6
>  %tmp21 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 1
>  %tmp22 = load float, float* %tmp21, align 4, !invariant.load !0, !noalias !6
>  %tmp23 = fadd fast float %tmp22, %tmp13
>  %tmp24 = fadd fast float %tmp23, %tmp15
>  %tmp25 = getelementptr inbounds i8, i8* %arg, i64 4
>  %tmp26 = bitcast i8* %tmp25 to float*
>  store float %tmp24, float* %tmp26, align 4, !alias.scope !6
>  %tmp27 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 2
>  %tmp28 = load float, float* %tmp27, align 8, !invariant.load !0, !noalias !6
>  %tmp29 = fadd fast float %tmp28, %tmp13
>  %tmp30 = fadd fast float %tmp29, %tmp15
>  %tmp31 = getelementptr inbounds i8, i8* %arg, i64 8
>  %tmp32 = bitcast i8* %tmp31 to float*
>  store float %tmp30, float* %tmp32, align 8, !alias.scope !6
>  %tmp33 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 1
>  %tmp34 = load float, float* %tmp33, align 4, !alias.scope !3, !noalias !6
>  %tmp35 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 1
>  %tmp36 = load float, float* %tmp35, align 4, !alias.scope !8, !noalias !6
>  %tmp37 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 0
>  %tmp38 = load float, float* %tmp37, align 4, !invariant.load !0, !noalias !6
>  %tmp39 = fadd fast float %tmp38, %tmp34
>  %tmp40 = fadd fast float %tmp39, %tmp36
>  %tmp41 = getelementptr inbounds i8, i8* %arg, i64 12
>  %tmp42 = bitcast i8* %tmp41 to float*
>  store float %tmp40, float* %tmp42, align 4, !alias.scope !6
>  %tmp43 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 1
>  %tmp44 = load float, float* %tmp43, align 4, !invariant.load !0, !noalias !6
>  %tmp45 = fadd fast float %tmp44, %tmp34
>  %tmp46 = fadd fast float %tmp45, %tmp36
>  %tmp47 = getelementptr inbounds i8, i8* %arg, i64 16
>  %tmp48 = bitcast i8* %tmp47 to float*
>  store float %tmp46, float* %tmp48, align 8, !alias.scope !6
>  %tmp49 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 2
>  %tmp50 = load float, float* %tmp49, align 4, !invariant.load !0, !noalias !6
>  %tmp51 = fadd fast float %tmp50, %tmp34
>  %tmp52 = fadd fast float %tmp51, %tmp36
>  %tmp53 = getelementptr inbounds i8, i8* %arg, i64 20
>  %tmp54 = bitcast i8* %tmp53 to float*
>  store float %tmp52, float* %tmp54, align 4, !alias.scope !6
>  ret void
> }
> 
> attributes #0 = { norecurse nounwind "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="true" "no-nans-fp-math"="true"
> "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
> 
> !0 = !{}
> !1 = !{i64 8}
> !2 = !{i64 24}
> !3 = !{!4}
> !4 = !{!"buffer: {index:3, offset:0, size:8}", !5}
> !5 = !{!"XLA global AA domain"}
> !6 = !{!7}
> !7 = !{!"buffer: {index:0, offset:0, size:24}", !5}
> !8 = !{!9}
> !9 = !{!"buffer: {index:2, offset:0, size:8}", !5}
> 
> 
> through opt -slp-vectorizer -scoped-noalias -mcpu=haswell
> 
> I get
> 
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-grtev4-linux-gnu"
> 
> ; Function Attrs: norecurse nounwind
> define void @zot(i8* nocapture align 8 dereferenceable(24) %arg, i8*
> noalias nocapture readnone %arg1, i8** noalias nocapture readonly
> %arg2, i8** noalias nocapture readnone %arg3, i64* noalias nocapture
> readnone %arg4) local_unnamed_addr #0 {
> bb:
>  %tmp = bitcast i8** %arg2 to [2 x float]**
>  %tmp5 = load [2 x float]*, [2 x float]** %tmp, align 8,
> !invariant.load !0, !dereferenceable !1, !align !1
>  %tmp6 = getelementptr inbounds i8*, i8** %arg2, i64 1
>  %tmp7 = bitcast i8** %tmp6 to [2 x [3 x float]]**
>  %tmp8 = load [2 x [3 x float]]*, [2 x [3 x float]]** %tmp7, align 8,
> !invariant.load !0, !dereferenceable !2, !align !1
>  %tmp9 = getelementptr inbounds i8*, i8** %arg2, i64 2
>  %tmp10 = bitcast i8** %tmp9 to [2 x float]**
>  %tmp11 = load [2 x float]*, [2 x float]** %tmp10, align 8,
> !invariant.load !0, !dereferenceable !1, !align !1
>  %tmp12 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 0
>  %tmp14 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 0
>  %tmp16 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 0
>  %tmp20 = bitcast i8* %arg to float*
>  %tmp21 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 1
>  %tmp25 = getelementptr inbounds i8, i8* %arg, i64 4
>  %tmp26 = bitcast i8* %tmp25 to float*
>  %tmp27 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 0, i64 2
>  %tmp31 = getelementptr inbounds i8, i8* %arg, i64 8
>  %tmp32 = bitcast i8* %tmp31 to float*
>  %tmp33 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 1
>  %0 = bitcast float* %tmp12 to <2 x float>*
>  %1 = load <2 x float>, <2 x float>* %0, align 8, !alias.scope !3, !noalias !6
>  %shuffle = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
>  %tmp35 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 1
>  %2 = bitcast float* %tmp14 to <2 x float>*
>  %3 = load <2 x float>, <2 x float>* %2, align 8, !alias.scope !8, !noalias !6
>  %shuffle1 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
>  %tmp37 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 0
>  %4 = bitcast float* %tmp16 to <4 x float>*
>  %5 = load <4 x float>, <4 x float>* %4, align 8, !invariant.load !0,
> !noalias !6
>  %6 = fadd fast <4 x float> %5, %shuffle
>  %7 = fadd fast <4 x float> %6, %shuffle1
>  %tmp41 = getelementptr inbounds i8, i8* %arg, i64 12
>  %tmp42 = bitcast i8* %tmp41 to float*
>  %8 = bitcast float* %tmp20 to <4 x float>*
>  store <4 x float> %7, <4 x float>* %8, align 8, !alias.scope !6
>  %tmp43 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 1
>  %tmp44 = load float, float* %tmp43, align 4, !invariant.load !0, !noalias !6
>  %9 = extractelement <4 x float> %shuffle, i32 1
>  %tmp45 = fadd fast float %tmp44, %9
>  %10 = extractelement <4 x float> %shuffle1, i32 1
>  %tmp46 = fadd fast float %tmp45, %10
>  %tmp47 = getelementptr inbounds i8, i8* %arg, i64 16
>  %tmp48 = bitcast i8* %tmp47 to float*
>  store float %tmp46, float* %tmp48, align 8, !alias.scope !6
>  %tmp49 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
> float]]* %tmp8, i64 0, i64 1, i64 2
>  %tmp50 = load float, float* %tmp49, align 4, !invariant.load !0, !noalias !6
>  %tmp51 = fadd fast float %tmp50, %9
>  %tmp52 = fadd fast float %tmp51, %10
>  %tmp53 = getelementptr inbounds i8, i8* %arg, i64 20
>  %tmp54 = bitcast i8* %tmp53 to float*
>  store float %tmp52, float* %tmp54, align 4, !alias.scope !6
>  ret void
> }
> 
> attributes #0 = { norecurse nounwind "no-frame-pointer-elim"="false"
> "no-infs-fp-math"="true" "no-nans-fp-math"="true"
> "no-signed-zeros-fp-math"="true" "target-cpu"="haswell"
> "unsafe-fp-math"="true" }
> 
> !0 = !{}
> !1 = !{i64 8}
> !2 = !{i64 24}
> !3 = !{!4}
> !4 = !{!"buffer: {index:3, offset:0, size:8}", !5}
> !5 = !{!"XLA global AA domain"}
> !6 = !{!7}
> !7 = !{!"buffer: {index:0, offset:0, size:24}", !5}
> !8 = !{!9}
> !9 = !{!"buffer: {index:2, offset:0, size:8}", !5}
> 
> 
> I think in the optimized IR the store to %tmp54 is storing an
> incorrect value.  In the original IR it was storing %tmp50 + %tmp34 +
> %tmp36 == *%tmp49 + *%tmp33 + *%tmp35.  In the new IR the same store
> stores the value %tmp50 + %9 + %10.  However %9 is *%tmp12, which is
> %tmp13 in the original IR.  Similarly %10 is *%tmp14 which is %tmp15
> in the original IR.
> 
> Either the extract element indices here should be 3 (the only lane of
> the <0, 0, 0, 1> shuffle that holds element 1 of the loaded pair)
> 
>  %9 = extractelement <4 x float> %shuffle, i32 1
>  %10 = extractelement <4 x float> %shuffle1, i32 1
> 
> or their operands should be %1 and %3 respectively.
> 
> If this is a correct assessment, can you please revert for now?
> 
> -- Sanjoy