[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

Ketan Surender via llvm-dev llvm-dev at lists.llvm.org
Thu Mar 9 16:04:11 PST 2017


In terms of releases, I observe the bug to go away in LLVM 3.9.0. It seems like the next step is to bisect changes between 3.8.1 and 3.9.0. The following tool looks promising: https://github.com/llvm-mirror/zorg/blob/master/llvmbisect/docs/llvmlab_bisect.rst


Ketan

________________________________
From: Ramkumar Ramachandra
Sent: Monday, March 6, 2017 3:51:10 PM
To: Robinson, Paul
Cc: Dale Martin; Ketan Surender; llvm-dev at lists.llvm.org
Subject: Re: [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm


Nope, the bug seems to be fixed in the latest LLVM.


Ram

________________________________
From: Robinson, Paul <paul.robinson at sony.com>
Sent: Monday, March 6, 2017 3:49:40 PM
To: Ramkumar Ramachandra
Cc: Dale Martin; Ketan Surender; llvm-dev at lists.llvm.org
Subject: RE: [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

LLVM 3.8 was released a year ago.  A lot of things have changed since then.
Are you able to reproduce the problem with the current HEAD, or with the LLVM 4.0 release candidate?
--paulr

From: llvm-dev [mailto:llvm-dev-bounces at lists.llvm.org] On Behalf Of Ramkumar Ramachandra via llvm-dev
Sent: Wednesday, March 01, 2017 10:17 AM
To: llvm-dev at lists.llvm.org
Cc: Dale Martin; Ketan Surender
Subject: [llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm


Hi,



We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT and have isolated working.ll and broken.ll after middle-end optimizations -- in the block merge128, notice that broken.ll has an fcmp une comparison to zero and a conditional branch based on that comparison:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31020 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31020, i64* %593, align 1
  %_rtB_726 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32 39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32 39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32 39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32 149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140
  %599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32 154
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %603 = fcmp une double %rtb_Sum3_737, 0.000000e+00
  %_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  br i1 %603, label %true73, label %false74

Now, in broken.asm, notice the same merge128 is missing the branch instruction:

.LBB6_55:                               # %merge128
movq
184(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
184(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
184(%rsp), %rax
movq
$0, 744(%rax)
movq
184(%rsp), %rax
movq
$0, 752(%rax)
movq
184(%rsp), %rax
movq
$0, 760(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
176(%rsp), %rax
movsd
5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq
184(%rsp), %rax
xorpd
%xmm1, %xmm1
ucomisd
%xmm1, %xmm0
movq
672(%rax), %rcx
movq
%rcx, 200(%rsp)
movd
%rcx, %xmm0
addsd
120(%rax), %xmm0
movq
176(%rsp), %rcx
mulsd
5680(%rcx), %xmm0
movsd
%xmm0, 768(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero

We know that this is the right instruction to be looking at, because we can line it up with working.ll and working.asm:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31025 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31025, i64* %593, align 1
  %_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0, i32 39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0, i32 39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0, i32 39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0, i32 149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140
  %599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0, i32 155
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0, i32 154
  %_rtP__Switch_Threshold = load double, double* %603, align 1
  %604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold
  %_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  br i1 %604, label %true73, label %false74

working.ll is a slightly different model from broken.ll, in that it loads the "zero value" from memory and does fcmp ogt instead of fcmp une. Otherwise, they're the same. Now, let's look at working.asm:


.LBB6_55:                               # %merge128
movq
184(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
184(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
184(%rsp), %rax
movq
$0, 744(%rax)
movq
184(%rsp), %rax
movq
$0, 752(%rax)
movq
184(%rsp), %rax
movq
$0, 760(%rax)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
176(%rsp), %rax
movsd
5656(%rax), %xmm0       # xmm0 = mem[0],zero
movq
184(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq
176(%rsp), %rax
ucomisd
5648(%rax), %xmm0
movq
184(%rsp), %rcx
jbe
.LBB6_56
# BB#128:                               # %true73
movq
672(%rcx), %rdx
jmp
.LBB6_129
.LBB6_56:                               # %false74
movq
696(%rcx), %rdx
.LBB6_129:                              # %merge129
movq
%rdx, 200(%rsp)
movd
%rdx, %xmm0
addsd
120(%rcx), %xmm0
mulsd
5688(%rax), %xmm0
movsd
%xmm0, 768(%rcx)
movq
176(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero

Notice that the blocks true73 and false74 are completely absent in broken.asm.



If you want to generate this asm yourself, please use LLVM 3.8's llc on the .ll files. For viewing convenience, please use a difftool to look at broken.ll versus working.ll and broken.asm versus working.asm -- I've highlighted the differences above at merge128.



Further, we have instrumented this code to print out the value of rtb_Sum3_737, and found that it takes values other than zero, hitting both branches at execution. We would like to know if the community is aware of this bug, and which patch fixed it. Finally, see broken-latest.asm to see the output from the latest llc -- the jump is present and the bug has been fixed:


.LBB6_99:                               # %merge128
movq
8(%rsp), %rcx
movq
%rax, 728(%rcx)
movq
8(%rsp), %rax
movq
728(%rax), %rcx
movq
%rcx, 736(%rax)
movq
8(%rsp), %rax
movq
$0, 744(%rax)
movq
8(%rsp), %rax
movq
$0, 752(%rax)
movq
8(%rsp), %rax
movq
$0, 760(%rax)
movq
16(%rsp), %rax
movsd
5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq
8(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
40(%rsp), %xmm1         # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd
%xmm0, %xmm1
movsd
%xmm1, 672(%rax)
movq
16(%rsp), %rax
movsd
5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq
8(%rsp), %rax
mulsd
648(%rax), %xmm0
movsd
%xmm0, 704(%rax)
movsd
32(%rsp), %xmm0         # xmm0 = mem[0],zero
movq
8(%rsp), %rax
xorpd
%xmm1, %xmm1
ucomisd
%xmm1, %xmm0
jne
.LBB6_100
jnp
.LBB6_101
.LBB6_100:                              # %true73
movq
672(%rax), %rcx
jmp
.LBB6_102
.LBB6_101:                              # %false74
movq
696(%rax), %rcx


Thanks.



Ram
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170310/966bccc9/attachment-0001.html>


More information about the llvm-dev mailing list