[MachineCopyPropagation] Handle undef flags conservatively so that we do not remove copies that are useful after breaking some hardware dependencies

Thu May 28 07:06:48 PDT 2015

Hi Quentin,

I think I have found a possible regression in LLVM that was revealed by 
one of your commits (r235647) which changes the handling of undefs by 
MachineCopyPropagation. This occurs on X86-64 with the following code:

%vreg92<def> = IMPLICIT_DEF; VR128:%vreg92
%vreg91<def,tied1> = PUNPCKLBWrr %vreg78<tied0>, %vreg92; VR128:%vreg91,%vreg78,%vreg92
%vreg94<def> = IMPLICIT_DEF; VR128:%vreg94
%vreg93<def,tied1> = PUNPCKLWDrr %vreg91<tied0>, %vreg94; VR128:%vreg93,%vreg91,%vreg94
%vreg95<def,tied1> = PSLLDri %vreg93<tied0>, 31; VR128:%vreg95,%vreg93
%vreg96<def,tied1> = PSRADri %vreg95<tied0>, 31; VR128:%vreg96,%vreg95

Later on the IMPLICIT_DEFs are turned into <undef> which, after your 
changes, causes MachineCopyPropagation to remove one copy:

MOVAPSmr %RSP, 1, %noreg, 16, %noreg, %XMM0<kill>; mem:ST16[FixedStack11]
%XMM2<def> = MOVAPSrm %RSP, 1, %noreg, 160, %noreg; mem:LD16[FixedStack2]
%XMM0<def> = KILL %XMM2 ; This was COPY before MachineCopyPropagation
%XMM1<def> = COPY %XMM2
%XMM2<def,tied1> = PUNPCKLBWrr %XMM2<kill,tied0>, %XMM0<undef>
%XMM2<def,tied1> = PUNPCKLWDrr %XMM2<kill,tied0>, %XMM0<undef>
%XMM2<def,tied1> = PSLLDri %XMM2<kill,tied0>, 31
%XMM2<def,tied1> = PSRADri %XMM2<kill,tied0>, 31

One of our tests that was previously passing now fails, and the removed 
COPY is the only difference I can see in the generated code. Looking only 
at the code above, the copy does not appear to be needed, which makes the 
failure puzzling.

I have reduced the IR that exhibits this issue to a manageable size and 
created a .ll file for testing. When I run this file with lli using the 
interpreter it passes, same with the JIT on ARM. It fails however on 
X86-64 using the JIT. Reverting your changes, it passes on X86-64 using 
the JIT.

Do you think that it's a bug in the X86 target that was revealed by your 
changes or that MachineCopyPropagation is doing something wrong?

Thanks,
Pierre-Andre

-- 
Pierre-Andre Saulais
Principal Software Engineer (Compilers)
Codeplay Software Ltd
45 York Place, Edinburgh, EH1 3HP
Tel: 0131 466 0503
Fax: 0131 557 6600
Website: http://www.codeplay.com
Twitter: https://twitter.com/codeplaysoft

This email and any attachments may contain confidential and/or privileged information and is for use by the addressee only. If you are not the intended recipient, please notify Codeplay Software Ltd immediately and delete the message from your computer. You may not copy or forward it, or use or disclose its contents to any other person. Any views or other information in this message which do not relate to our business are not authorized by Codeplay Software Ltd, nor does this message form part of any contract unless so stated.
As internet communications are capable of data corruption Codeplay Software Ltd does not accept any responsibility for any changes made to this message after it was sent. Please note that Codeplay Software Ltd does not accept any liability or responsibility for viruses and it is your responsibility to scan any attachments.
Company registered in England and Wales, number: 04567874
Registered office: 81 Linkfield Street, Redhill RH1 6BY

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150528/48219f02/attachment.html>
-------------- next part --------------
; ModuleID = 'test_sext_v16i1_punpcklbw_machine_cp.ll'
; armv7 variant of the reduced reproducer. The accompanying report states
; that this test passes with the JIT on ARM.
target triple = "armv7-unknown-linux-gnueabi"

; 16-lane constant holding -8.0 .. 7.0. It is not referenced by any function
; in this module; @main uses an inline copy of the same constant instead.
@V = constant <16 x float> <float -8.000000e+00, float -7.000000e+00, float -6.000000e+00, float -5.000000e+00, float -4.000000e+00, float -3.000000e+00, float -2.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 64
; printf format string used by @main to report a mismatching lane.
@.str = private unnamed_addr constant [42 x i8] c"error at %d: expected = %f, actual = %f \0A\00", align 1

; foo: lane-wise computation on a <16 x float> vector built from integer
; compare masks and bitwise selects; returns a <16 x float>.
; @main in this module compares the result lane by lane against the
; unmodified input and reports any lane that differs.
; Function Attrs: nounwind readnone noinline
define <16 x float> @foo(<16 x float> %x) #4 {
  ; Reinterpret each float lane as its raw 32-bit pattern.
  %v1 = bitcast <16 x float> %x to <16 x i32>
  ; Per-lane i1: bit pattern is negative as a signed i32.
  %v3 = icmp slt <16 x i32> %v1, zeroinitializer
  ; Per-lane i1: lowest bit of the bit pattern is clear.
  %v4 = and <16 x i32> %v1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %v7 = icmp eq <16 x i32> %v4, zeroinitializer
  ; Combine the two conditions and widen the <16 x i1> result to an
  ; all-ones / all-zeros i32 mask per lane.
  %v10 = or <16 x i1> %v3, %v7
  %v11 = sext <16 x i1> %v10 to <16 x i32>
  ; 0/1 per lane from the integer sign test.
  %v14 = zext <16 x i1> %v3 to <16 x i32>
  ; Ordered floating-point "x < 0.0" test, widened both as a mask (sext)
  ; and as 0/1 (zext).
  %v16 = fcmp olt <16 x float> %x, zeroinitializer
  %v17 = sext <16 x i1> %v16 to <16 x i32>
  %v18 = zext <16 x i1> %v16 to <16 x i32>
  ; %v19: 0/1 lanes where the integer and floating-point sign tests disagree.
  %v19 = xor <16 x i32> %v14, %v18
  ; %v20: final select mask (union of the two sign-extended masks).
  %v20 = or <16 x i32> %v17, %v11
  ; Round-trip each lane through a signed 32-bit integer.
  %v21 = fptosi <16 x float> %x to <16 x i32>
  %v22 = sitofp <16 x i32> %v21 to <16 x float>
  ; Lanes where the round-tripped value is > 0.0 and the original bit
  ; pattern is negative.
  %v69 = fcmp ogt <16 x float> %v22, zeroinitializer
  %v75 = and <16 x i1> %v69, %v3
  %v77 = bitcast <16 x float> %v22 to <16 x i32>
  ; xor/and/xor bitwise select: %v81 takes the bits of %v1 where mask %v79
  ; is set, and the bits of %v77 elsewhere.
  %v78 = xor <16 x i32> %v77, %v1
  %v79 = sext <16 x i1> %v75 to <16 x i32>
  %v80 = and <16 x i32> %v78, %v79
  %v81 = xor <16 x i32> %v77, %v80
  %v82 = and <16 x i32> %v1, %v81
  ; Second bitwise select: %v85 takes the bits of %v82 where mask %v20 is
  ; set, and the bits of %v19 elsewhere.
  %v83 = xor <16 x i32> %v19, %v82
  %v84 = and <16 x i32> %v83, %v20
  %v85 = xor <16 x i32> %v19, %v84
  ; Reinterpret the integer lanes as floats for the return value.
  %v86 = bitcast <16 x i32> %v85 to <16 x float>
  ret <16 x float> %v86
}

; main: test driver. Calls @foo on the constant vector -8.0 .. 7.0, then
; compares each of the 16 result lanes with the same lane of that input
; constant. Every differing lane is reported through printf.
; Returns 0 when all lanes match and 1 when at least one lane differs.
; %argc and %argv are not used.
; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** nocapture readnone %argv) #1 {
  ; NOTE(review): %1 is not referenced anywhere else in this function;
  ; presumably a leftover from test reduction.
  %1 = alloca <16 x float>, align 64
  %2 = call <16 x float> @foo(<16 x float> <float -8.000000e+00, float -7.000000e+00, float -6.000000e+00, float -5.000000e+00, float -4.000000e+00, float -3.000000e+00, float -2.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>) #4
  br label %4

  ; Loop exit: return the accumulated failure flag.
; <label>:3                                       ; preds = %12
  %failed.1.lcssa = phi i32 [ %failed.1, %12 ]
  ret i32 %failed.1.lcssa

  ; Loop header: %i.02 is the lane index (starts at 0), %failed.01 is the
  ; failure flag carried from the previous iteration (starts at 0).
; <label>:4                                       ; preds = %12, %0
  %i.02 = phi i32 [ 0, %0 ], [ %13, %12 ]
  %failed.01 = phi i32 [ 0, %0 ], [ %failed.1, %12 ]
  ; %5 = actual lane from @foo's result, %6 = expected lane (the input).
  %5 = extractelement <16 x float> %2, i32 %i.02
  %6 = extractelement <16 x float> <float -8.000000e+00, float -7.000000e+00, float -6.000000e+00, float -5.000000e+00, float -4.000000e+00, float -3.000000e+00, float -2.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, i32 %i.02
  ; "une" is true when the lanes differ or either one is NaN.
  %7 = fcmp une float %5, %6
  br i1 %7, label %8, label %12

  ; Mismatch path: print index, expected and actual. The floats are
  ; extended to double before being passed to the variadic printf.
; <label>:8                                       ; preds = %4
  %9 = fpext float %6 to double
  %10 = fpext float %5 to double
  %11 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i32 %i.02, double %9, double %10) #3
  br label %12

  ; Loop latch: set the flag to 1 if this lane mismatched, otherwise keep
  ; its previous value; advance the index and stop after lane 15.
; <label>:12                                      ; preds = %4, %8
  %failed.1 = phi i32 [ 1, %8 ], [ %failed.01, %4 ]
  %13 = add nuw nsw i32 %i.02, 1
  %exitcond = icmp eq i32 %13, 16
  br i1 %exitcond, label %3, label %4
}

; External variadic printf, called by @main to report mismatching lanes.
; Function Attrs: nounwind
declare i32 @printf(i8* nocapture readonly, ...) #2

; Attribute groups. In this module: #1 is used by @main, #2 by @printf,
; #3 by the printf call site, #4 by @foo and its call site.
; #0 is not referenced by any function or call site in this module.
attributes #0 = { noinline nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"  "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"  "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"  "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }
attributes #4 = { nounwind readnone noinline "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

; Producer identification.
!llvm.ident = !{!0}

!0 = !{!"clang version 3.7.0 "}
; !1, !2 and !3 form a TBAA type description; no instruction in this module
; carries a metadata attachment that references them.
!1 = !{!2, !2, i64 0}
!2 = !{!"omnipotent char", !3, i64 0}
!3 = !{!"Simple C/C++ TBAA"}
-------------- next part --------------
; ModuleID = 'test_sext_v16i1_punpcklbw_machine_cp.ll'
; x86_64 variant of the reduced reproducer. The accompanying report states
; that this test fails with the JIT on X86-64 after r235647 and passes when
; that change is reverted.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; 16-lane constant holding -8.0 .. 7.0. It is not referenced by any function
; in this module; @main uses an inline copy of the same constant instead.
@V = constant <16 x float> <float -8.000000e+00, float -7.000000e+00, float -6.000000e+00, float -5.000000e+00, float -4.000000e+00, float -3.000000e+00, float -2.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 64
; printf format string used by @main to report a mismatching lane.
@.str = private unnamed_addr constant [42 x i8] c"error at %d: expected = %f, actual = %f \0A\00", align 1

; foo: lane-wise computation on a <16 x float> vector built from integer
; compare masks and bitwise selects; returns a <16 x float>.
; @main in this module compares the result lane by lane against the
; unmodified input and reports any lane that differs.
; NOTE(review): going by the module name and the accompanying report, the
; <16 x i1> -> <16 x i32> extensions below are presumably what lower to the
; PUNPCKLBW / PUNPCKLWD / PSLLD / PSRAD sequence on x86-64 -- confirm against
; the generated machine code.
; Function Attrs: nounwind readnone noinline
define <16 x float> @foo(<16 x float> %x) #4 {
  ; Reinterpret each float lane as its raw 32-bit pattern.
  %v1 = bitcast <16 x float> %x to <16 x i32>
  ; Per-lane i1: bit pattern is negative as a signed i32.
  %v3 = icmp slt <16 x i32> %v1, zeroinitializer
  ; Per-lane i1: lowest bit of the bit pattern is clear.
  %v4 = and <16 x i32> %v1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %v7 = icmp eq <16 x i32> %v4, zeroinitializer
  ; Combine the two conditions and widen the <16 x i1> result to an
  ; all-ones / all-zeros i32 mask per lane.
  %v10 = or <16 x i1> %v3, %v7
  %v11 = sext <16 x i1> %v10 to <16 x i32>
  ; 0/1 per lane from the integer sign test.
  %v14 = zext <16 x i1> %v3 to <16 x i32>
  ; Ordered floating-point "x < 0.0" test, widened both as a mask (sext)
  ; and as 0/1 (zext).
  %v16 = fcmp olt <16 x float> %x, zeroinitializer
  %v17 = sext <16 x i1> %v16 to <16 x i32>
  %v18 = zext <16 x i1> %v16 to <16 x i32>
  ; %v19: 0/1 lanes where the integer and floating-point sign tests disagree.
  %v19 = xor <16 x i32> %v14, %v18
  ; %v20: final select mask (union of the two sign-extended masks).
  %v20 = or <16 x i32> %v17, %v11
  ; Round-trip each lane through a signed 32-bit integer.
  %v21 = fptosi <16 x float> %x to <16 x i32>
  %v22 = sitofp <16 x i32> %v21 to <16 x float>
  ; Lanes where the round-tripped value is > 0.0 and the original bit
  ; pattern is negative.
  %v69 = fcmp ogt <16 x float> %v22, zeroinitializer
  %v75 = and <16 x i1> %v69, %v3
  %v77 = bitcast <16 x float> %v22 to <16 x i32>
  ; xor/and/xor bitwise select: %v81 takes the bits of %v1 where mask %v79
  ; is set, and the bits of %v77 elsewhere.
  %v78 = xor <16 x i32> %v77, %v1
  %v79 = sext <16 x i1> %v75 to <16 x i32>
  %v80 = and <16 x i32> %v78, %v79
  %v81 = xor <16 x i32> %v77, %v80
  %v82 = and <16 x i32> %v1, %v81
  ; Second bitwise select: %v85 takes the bits of %v82 where mask %v20 is
  ; set, and the bits of %v19 elsewhere.
  %v83 = xor <16 x i32> %v19, %v82
  %v84 = and <16 x i32> %v83, %v20
  %v85 = xor <16 x i32> %v19, %v84
  ; Reinterpret the integer lanes as floats for the return value.
  %v86 = bitcast <16 x i32> %v85 to <16 x float>
  ret <16 x float> %v86
}

; main: test driver. Calls @foo on the constant vector -8.0 .. 7.0, then
; compares each of the 16 result lanes with the same lane of that input
; constant. Every differing lane is reported through printf.
; Returns 0 when all lanes match and 1 when at least one lane differs.
; %argc and %argv are not used.
; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** nocapture readnone %argv) #1 {
  ; NOTE(review): %1 is not referenced anywhere else in this function;
  ; presumably a leftover from test reduction.
  %1 = alloca <16 x float>, align 64
  %2 = call <16 x float> @foo(<16 x float> <float -8.000000e+00, float -7.000000e+00, float -6.000000e+00, float -5.000000e+00, float -4.000000e+00, float -3.000000e+00, float -2.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>) #4
  br label %4

  ; Loop exit: return the accumulated failure flag.
; <label>:3                                       ; preds = %12
  %failed.1.lcssa = phi i32 [ %failed.1, %12 ]
  ret i32 %failed.1.lcssa

  ; Loop header: %i.02 is the lane index (starts at 0), %failed.01 is the
  ; failure flag carried from the previous iteration (starts at 0).
; <label>:4                                       ; preds = %12, %0
  %i.02 = phi i32 [ 0, %0 ], [ %13, %12 ]
  %failed.01 = phi i32 [ 0, %0 ], [ %failed.1, %12 ]
  ; %5 = actual lane from @foo's result, %6 = expected lane (the input).
  %5 = extractelement <16 x float> %2, i32 %i.02
  %6 = extractelement <16 x float> <float -8.000000e+00, float -7.000000e+00, float -6.000000e+00, float -5.000000e+00, float -4.000000e+00, float -3.000000e+00, float -2.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, i32 %i.02
  ; "une" is true when the lanes differ or either one is NaN.
  %7 = fcmp une float %5, %6
  br i1 %7, label %8, label %12

  ; Mismatch path: print index, expected and actual. The floats are
  ; extended to double before being passed to the variadic printf.
; <label>:8                                       ; preds = %4
  %9 = fpext float %6 to double
  %10 = fpext float %5 to double
  %11 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i64 0, i64 0), i32 %i.02, double %9, double %10) #3
  br label %12

  ; Loop latch: set the flag to 1 if this lane mismatched, otherwise keep
  ; its previous value; advance the index and stop after lane 15.
; <label>:12                                      ; preds = %4, %8
  %failed.1 = phi i32 [ 1, %8 ], [ %failed.01, %4 ]
  %13 = add nuw nsw i32 %i.02, 1
  %exitcond = icmp eq i32 %13, 16
  br i1 %exitcond, label %3, label %4
}

; External variadic printf, called by @main to report mismatching lanes.
; Function Attrs: nounwind
declare i32 @printf(i8* nocapture readonly, ...) #2

; Attribute groups. In this module: #1 is used by @main, #2 by @printf,
; #3 by the printf call site, #4 by @foo and its call site.
; #0 is not referenced by any function or call site in this module.
; Groups #0-#2 set "target-cpu"="x86-64" and "target-features"="+sse,+sse2";
; group #4 (the one attached to @foo) sets neither.
attributes #0 = { noinline nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }
attributes #4 = { nounwind readnone noinline "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

; Producer identification.
!llvm.ident = !{!0}

!0 = !{!"clang version 3.7.0 "}
; !1, !2 and !3 form a TBAA type description; no instruction in this module
; carries a metadata attachment that references them.
!1 = !{!2, !2, i64 0}
!2 = !{!"omnipotent char", !3, i64 0}
!3 = !{!"Simple C/C++ TBAA"}