[llvm] r323530 - [SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle.

Fri Jan 26 17:24:57 PST 2018

Hi,

I think this patch is buggy.  If I run the following IR

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; Reproducer input (scalar form). Stores six floats to %arg at byte offsets
; 0, 4, 8, 12, 16 and 20. With A = the [2 x float] loaded via slot 0 of %arg2,
; M = the [2 x [3 x float]] loaded via slot 1 and B = the [2 x float] loaded
; via slot 2, the value stored at offset 4*(3*i + j) is M[i][j] + A[i] + B[i].
; Function Attrs: norecurse nounwind
define void @zot(i8* nocapture align 8 dereferenceable(24) %arg, i8*
noalias nocapture readnone %arg1, i8** noalias nocapture readonly
%arg2, i8** noalias nocapture readnone %arg3, i64* noalias nocapture
readnone %arg4) local_unnamed_addr #0 {
bb:
  ; Load the three buffer pointers from slots 0, 1 and 2 of %arg2.
  %tmp = bitcast i8** %arg2 to [2 x float]**
  %tmp5 = load [2 x float]*, [2 x float]** %tmp, align 8,
!invariant.load !0, !dereferenceable !1, !align !1
  %tmp6 = getelementptr inbounds i8*, i8** %arg2, i64 1
  %tmp7 = bitcast i8** %tmp6 to [2 x [3 x float]]**
  %tmp8 = load [2 x [3 x float]]*, [2 x [3 x float]]** %tmp7, align 8,
!invariant.load !0, !dereferenceable !2, !align !1
  %tmp9 = getelementptr inbounds i8*, i8** %arg2, i64 2
  %tmp10 = bitcast i8** %tmp9 to [2 x float]**
  %tmp11 = load [2 x float]*, [2 x float]** %tmp10, align 8,
!invariant.load !0, !dereferenceable !1, !align !1
  ; %tmp13 = A[0] and %tmp15 = B[0]; both are reused by the three row-0 outputs.
  %tmp12 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 0
  %tmp13 = load float, float* %tmp12, align 8, !alias.scope !3, !noalias !6
  %tmp14 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 0
  %tmp15 = load float, float* %tmp14, align 8, !alias.scope !8, !noalias !6
  ; Offset 0: M[0][0] + A[0] + B[0].
  %tmp16 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 0
  %tmp17 = load float, float* %tmp16, align 8, !invariant.load !0, !noalias !6
  %tmp18 = fadd fast float %tmp17, %tmp13
  %tmp19 = fadd fast float %tmp18, %tmp15
  %tmp20 = bitcast i8* %arg to float*
  store float %tmp19, float* %tmp20, align 8, !alias.scope !6
  ; Offset 4: M[0][1] + A[0] + B[0].
  %tmp21 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 1
  %tmp22 = load float, float* %tmp21, align 4, !invariant.load !0, !noalias !6
  %tmp23 = fadd fast float %tmp22, %tmp13
  %tmp24 = fadd fast float %tmp23, %tmp15
  %tmp25 = getelementptr inbounds i8, i8* %arg, i64 4
  %tmp26 = bitcast i8* %tmp25 to float*
  store float %tmp24, float* %tmp26, align 4, !alias.scope !6
  ; Offset 8: M[0][2] + A[0] + B[0].
  %tmp27 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 2
  %tmp28 = load float, float* %tmp27, align 8, !invariant.load !0, !noalias !6
  %tmp29 = fadd fast float %tmp28, %tmp13
  %tmp30 = fadd fast float %tmp29, %tmp15
  %tmp31 = getelementptr inbounds i8, i8* %arg, i64 8
  %tmp32 = bitcast i8* %tmp31 to float*
  store float %tmp30, float* %tmp32, align 8, !alias.scope !6
  ; %tmp34 = A[1] and %tmp36 = B[1]; both are reused by the three row-1 outputs.
  %tmp33 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 1
  %tmp34 = load float, float* %tmp33, align 4, !alias.scope !3, !noalias !6
  %tmp35 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 1
  %tmp36 = load float, float* %tmp35, align 4, !alias.scope !8, !noalias !6
  ; Offset 12: M[1][0] + A[1] + B[1].
  %tmp37 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 0
  %tmp38 = load float, float* %tmp37, align 4, !invariant.load !0, !noalias !6
  %tmp39 = fadd fast float %tmp38, %tmp34
  %tmp40 = fadd fast float %tmp39, %tmp36
  %tmp41 = getelementptr inbounds i8, i8* %arg, i64 12
  %tmp42 = bitcast i8* %tmp41 to float*
  store float %tmp40, float* %tmp42, align 4, !alias.scope !6
  ; Offset 16: M[1][1] + A[1] + B[1].
  %tmp43 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 1
  %tmp44 = load float, float* %tmp43, align 4, !invariant.load !0, !noalias !6
  %tmp45 = fadd fast float %tmp44, %tmp34
  %tmp46 = fadd fast float %tmp45, %tmp36
  %tmp47 = getelementptr inbounds i8, i8* %arg, i64 16
  %tmp48 = bitcast i8* %tmp47 to float*
  store float %tmp46, float* %tmp48, align 8, !alias.scope !6
  ; Offset 20: M[1][2] + A[1] + B[1].
  %tmp49 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 2
  %tmp50 = load float, float* %tmp49, align 4, !invariant.load !0, !noalias !6
  %tmp51 = fadd fast float %tmp50, %tmp34
  %tmp52 = fadd fast float %tmp51, %tmp36
  %tmp53 = getelementptr inbounds i8, i8* %arg, i64 20
  %tmp54 = bitcast i8* %tmp53 to float*
  store float %tmp52, float* %tmp54, align 4, !alias.scope !6
  ret void
}

; Attribute group #0, referenced by the definition of @zot.
attributes #0 = { norecurse nounwind "no-frame-pointer-elim"="false"
"no-infs-fp-math"="true" "no-nans-fp-math"="true"
"no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }

; !0: empty node, the operand of every !invariant.load attachment.
!0 = !{}
; !1: the constant 8, used for every !align and for !dereferenceable on the
; two [2 x float]* pointer loads.
!1 = !{i64 8}
; !2: the constant 24, used for !dereferenceable on the load of %tmp8.
!2 = !{i64 24}
; !3: scope list attached as !alias.scope to the float loads through %tmp5.
!3 = !{!4}
!4 = !{!"buffer: {index:3, offset:0, size:8}", !5}
; !5: second operand of each of the scope nodes !4, !7 and !9.
!5 = !{!"XLA global AA domain"}
; !6: scope list attached as !alias.scope to every store and as !noalias to
; every float load.
!6 = !{!7}
!7 = !{!"buffer: {index:0, offset:0, size:24}", !5}
; !8: scope list attached as !alias.scope to the float loads through %tmp11.
!8 = !{!9}
!9 = !{!"buffer: {index:2, offset:0, size:8}", !5}

through opt -slp-vectorizer -scoped-noalias -mcpu=haswell

I get

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; @zot as quoted after running the SLP vectorizer. The four scalar stores to
; byte offsets 0, 4, 8 and 12 of %arg are replaced by one <4 x float> store;
; the stores at offsets 16 and 20 remain scalar.
; Function Attrs: norecurse nounwind
define void @zot(i8* nocapture align 8 dereferenceable(24) %arg, i8*
noalias nocapture readnone %arg1, i8** noalias nocapture readonly
%arg2, i8** noalias nocapture readnone %arg3, i64* noalias nocapture
readnone %arg4) local_unnamed_addr #0 {
bb:
  ; Load the three buffer pointers from slots 0, 1 and 2 of %arg2.
  %tmp = bitcast i8** %arg2 to [2 x float]**
  %tmp5 = load [2 x float]*, [2 x float]** %tmp, align 8,
!invariant.load !0, !dereferenceable !1, !align !1
  %tmp6 = getelementptr inbounds i8*, i8** %arg2, i64 1
  %tmp7 = bitcast i8** %tmp6 to [2 x [3 x float]]**
  %tmp8 = load [2 x [3 x float]]*, [2 x [3 x float]]** %tmp7, align 8,
!invariant.load !0, !dereferenceable !2, !align !1
  %tmp9 = getelementptr inbounds i8*, i8** %arg2, i64 2
  %tmp10 = bitcast i8** %tmp9 to [2 x float]**
  %tmp11 = load [2 x float]*, [2 x float]** %tmp10, align 8,
!invariant.load !0, !dereferenceable !1, !align !1
  %tmp12 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 0
  %tmp14 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 0
  %tmp16 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 0
  %tmp20 = bitcast i8* %arg to float*
  %tmp21 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 1
  %tmp25 = getelementptr inbounds i8, i8* %arg, i64 4
  %tmp26 = bitcast i8* %tmp25 to float*
  %tmp27 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 0, i64 2
  %tmp31 = getelementptr inbounds i8, i8* %arg, i64 8
  %tmp32 = bitcast i8* %tmp31 to float*
  %tmp33 = getelementptr inbounds [2 x float], [2 x float]* %tmp5, i64 0, i64 1
  ; %1 holds both floats of the array behind %tmp5; %shuffle expands it to
  ; lanes <%1[0], %1[0], %1[0], %1[1]>.
  %0 = bitcast float* %tmp12 to <2 x float>*
  %1 = load <2 x float>, <2 x float>* %0, align 8, !alias.scope !3, !noalias !6
  %shuffle = shufflevector <2 x float> %1, <2 x float> undef, <4 x
i32> <i32 0, i32 0, i32 0, i32 1>
  %tmp35 = getelementptr inbounds [2 x float], [2 x float]* %tmp11, i64 0, i64 1
  ; %3 holds both floats of the array behind %tmp11; %shuffle1 expands it to
  ; lanes <%3[0], %3[0], %3[0], %3[1]>.
  %2 = bitcast float* %tmp14 to <2 x float>*
  %3 = load <2 x float>, <2 x float>* %2, align 8, !alias.scope !8, !noalias !6
  %shuffle1 = shufflevector <2 x float> %3, <2 x float> undef, <4 x
i32> <i32 0, i32 0, i32 0, i32 1>
  %tmp37 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 0
  ; Four consecutive floats starting at %tmp16, i.e. elements [0][0], [0][1],
  ; [0][2] and [1][0] of the array behind %tmp8.
  %4 = bitcast float* %tmp16 to <4 x float>*
  %5 = load <4 x float>, <4 x float>* %4, align 8, !invariant.load !0,
!noalias !6
  %6 = fadd fast <4 x float> %5, %shuffle
  %7 = fadd fast <4 x float> %6, %shuffle1
  %tmp41 = getelementptr inbounds i8, i8* %arg, i64 12
  %tmp42 = bitcast i8* %tmp41 to float*
  ; One vector store covering byte offsets 0 through 15 of %arg.
  %8 = bitcast float* %tmp20 to <4 x float>*
  store <4 x float> %7, <4 x float>* %8, align 8, !alias.scope !6
  %tmp43 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 1
  %tmp44 = load float, float* %tmp43, align 4, !invariant.load !0, !noalias !6
  ; Lane 1 of %shuffle is %1[0] (mask <0, 0, 0, 1>), the float at %tmp12.
  ; The float at %tmp33 is %1[1], which sits in lane 3 of %shuffle.
  %9 = extractelement <4 x float> %shuffle, i32 1
  %tmp45 = fadd fast float %tmp44, %9
  ; Lane 1 of %shuffle1 is %3[0], the float at %tmp14; the float at %tmp35 is
  ; %3[1], which sits in lane 3 of %shuffle1.
  %10 = extractelement <4 x float> %shuffle1, i32 1
  %tmp46 = fadd fast float %tmp45, %10
  %tmp47 = getelementptr inbounds i8, i8* %arg, i64 16
  %tmp48 = bitcast i8* %tmp47 to float*
  store float %tmp46, float* %tmp48, align 8, !alias.scope !6
  %tmp49 = getelementptr inbounds [2 x [3 x float]], [2 x [3 x
float]]* %tmp8, i64 0, i64 1, i64 2
  %tmp50 = load float, float* %tmp49, align 4, !invariant.load !0, !noalias !6
  ; The same extracted values %9 and %10 feed the store at offset 20.
  %tmp51 = fadd fast float %tmp50, %9
  %tmp52 = fadd fast float %tmp51, %10
  %tmp53 = getelementptr inbounds i8, i8* %arg, i64 20
  %tmp54 = bitcast i8* %tmp53 to float*
  store float %tmp52, float* %tmp54, align 4, !alias.scope !6
  ret void
}

; Attribute group #0, referenced by the definition of @zot; it carries
; "target-cpu"="haswell".
attributes #0 = { norecurse nounwind "no-frame-pointer-elim"="false"
"no-infs-fp-math"="true" "no-nans-fp-math"="true"
"no-signed-zeros-fp-math"="true" "target-cpu"="haswell"
"unsafe-fp-math"="true" }

; !0: empty node, the operand of every !invariant.load attachment.
!0 = !{}
; !1: the constant 8, used for every !align and for !dereferenceable on the
; two [2 x float]* pointer loads.
!1 = !{i64 8}
; !2: the constant 24, used for !dereferenceable on the load of %tmp8.
!2 = !{i64 24}
; !3: scope list attached as !alias.scope to the <2 x float> load %1.
!3 = !{!4}
!4 = !{!"buffer: {index:3, offset:0, size:8}", !5}
; !5: second operand of each of the scope nodes !4, !7 and !9.
!5 = !{!"XLA global AA domain"}
; !6: scope list attached as !alias.scope to every store and as !noalias to
; every float and vector load.
!6 = !{!7}
!7 = !{!"buffer: {index:0, offset:0, size:24}", !5}
; !8: scope list attached as !alias.scope to the <2 x float> load %3.
!8 = !{!9}
!9 = !{!"buffer: {index:2, offset:0, size:8}", !5}

I think in the optimized IR the store to %tmp54 is storing an
incorrect value.  In the original IR it was storing %tmp50 + %tmp34 +
%tmp36 == *%tmp49 + *%tmp33 + *%tmp35.  In the new IR the same store
stores the value %tmp50 + %9 + %10.  However %9 is *%tmp12, which is
%tmp13 in the original IR.  Similarly %10 is *%tmp14 which is %tmp15
in the original IR.  (The store to %tmp48 is affected the same way: it
now stores %tmp44 + %9 + %10 instead of %tmp44 + %tmp34 + %tmp36.)

Either the extract element indices here should be 3 (the only lane of
the <0, 0, 0, 1> shuffles that holds element 1 of the loaded vector)

  %9 = extractelement <4 x float> %shuffle, i32 1
  %10 = extractelement <4 x float> %shuffle1, i32 1

or their operands should be %1 and %3 respectively.

If this is a correct assessment, can you please revert for now?

-- Sanjoy