[llvm-dev] [Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

Ramkumar Ramachandra via llvm-dev llvm-dev at lists.llvm.org
Wed Mar 1 10:17:01 PST 2017


Hi,


We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT and have isolated working.ll and broken.ll after middle-end optimizations -- in the block merge128, notice that broken.ll has an fcmp une comparison against zero and a conditional branch based on that comparison:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31020 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31020, i64* %593, align 1
  %_rtB_726 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32 39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32 39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32 39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32 149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140
  %599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T*, %P_repro_T** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32 154
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %603 = fcmp une double %rtb_Sum3_737, 0.000000e+00
  %_rtB_739 = load %B_repro_T*, %B_repro_T** %_rtB_, align 8
  br i1 %603, label %true73, label %false74

Now, in broken.asm, notice the same merge128 is missing the branch instruction:

.LBB6_55:                               # %merge128
movq 184(%rsp), %rcx
movq %rax, 728(%rcx)
movq 184(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 184(%rsp), %rax
movq $0, 744(%rax)
movq 184(%rsp), %rax
movq $0, 752(%rax)
movq 184(%rsp), %rax
movq $0, 760(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 176(%rsp), %rax
movsd 5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq 184(%rsp), %rax
xorpd %xmm1, %xmm1
ucomisd %xmm1, %xmm0
movq 672(%rax), %rcx
movq %rcx, 200(%rsp)
movd %rcx, %xmm0
addsd 120(%rax), %xmm0
movq 176(%rsp), %rcx
mulsd 5680(%rcx), %xmm0
movsd %xmm0, 768(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero

We know that this is the right instruction to be looking at, because we can line it up with working.ll and working.asm:


merge128:                                         ; preds = %true71, %false72
  %_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 38
  %591 = bitcast double* %590 to i64*
  %_rtB__Step31025 = load i64, i64* %591, align 1
  %592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 39
  %593 = bitcast [4 x double]* %592 to i64*
  store i64 %_rtB__Step31025, i64* %593, align 1
  %_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0, i32 39, i64 1
  store double 0.000000e+00, double* %594, align 1
  %_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0, i32 39, i64 2
  store double 0.000000e+00, double* %595, align 1
  %_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0, i32 39, i64 3
  store double 0.000000e+00, double* %596, align 1
  %_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0, i32 149
  %_rtP__Kp_Gain_n = load double, double* %597, align 1
  %_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 34, i64 0
  %_rtB__Switch_k_el = load double, double* %598, align 1
  %tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
  %tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140
  %599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 35, i64 0
  store double %tmp141, double* %599, align 1
  %_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0, i32 155
  %_rtP__Ki_Gain_b = load double, double* %600, align 1
  %_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  %601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 34, i64 0
  %_rtB__Switch_k_el735 = load double, double* %601, align 1
  %tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
  %602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 37, i64 0
  store double %tmp142, double* %602, align 1
  %rtb_Sum3_737 = load double, double* %rtb_Sum3_, align 8
  %_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %_rtP_, align 8
  %603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0, i32 154
  %_rtP__Switch_Threshold = load double, double* %603, align 1
  %604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold
  %_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %_rtB_, align 8
  br i1 %604, label %true73, label %false74

working.ll is a slightly different model from broken.ll, in that it loads the "zero value" from memory and does fcmp ogt instead of fcmp une. Otherwise, they're the same. Now, let's look at working.asm:


.LBB6_55:                               # %merge128
movq 184(%rsp), %rcx
movq %rax, 728(%rcx)
movq 184(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 184(%rsp), %rax
movq $0, 744(%rax)
movq 184(%rsp), %rax
movq $0, 752(%rax)
movq 184(%rsp), %rax
movq $0, 760(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 160(%rsp), %xmm1        # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 176(%rsp), %rax
movsd 5656(%rax), %xmm0       # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 192(%rsp), %xmm0        # xmm0 = mem[0],zero
movq 176(%rsp), %rax
ucomisd 5648(%rax), %xmm0
movq 184(%rsp), %rcx
jbe .LBB6_56
# BB#128:                               # %true73
movq 672(%rcx), %rdx
jmp .LBB6_129
.LBB6_56:                               # %false74
movq 696(%rcx), %rdx
.LBB6_129:                              # %merge129
movq %rdx, 200(%rsp)
movd %rdx, %xmm0
addsd 120(%rcx), %xmm0
mulsd 5688(%rax), %xmm0
movsd %xmm0, 768(%rcx)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero

Notice that the blocks true73 and false74 are completely absent in broken.asm.


If you want to generate this asm yourself, please use LLVM 3.8's llc on the .ll files. For viewing convenience, please use a difftool to look at broken.ll versus working.ll and broken.asm versus working.asm -- I've highlighted the differences above at merge128.


Further, we have instrumented this code to print out the value of rtb_Sum3_737, and found that it takes values other than zero, hitting both branches at execution. We would like to know if the community is aware of this bug, and which patch fixed it. Finally, see broken-latest.asm to see the output from the latest llc -- the jump is present and the bug has been fixed:


.LBB6_99:                               # %merge128
movq 8(%rsp), %rcx
movq %rax, 728(%rcx)
movq 8(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 8(%rsp), %rax
movq $0, 744(%rax)
movq 8(%rsp), %rax
movq $0, 752(%rax)
movq 8(%rsp), %rax
movq $0, 760(%rax)
movq 16(%rsp), %rax
movsd 5608(%rax), %xmm0       # xmm0 = mem[0],zero
movq 8(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 40(%rsp), %xmm1         # 8-byte Reload
                                        # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 16(%rsp), %rax
movsd 5648(%rax), %xmm0       # xmm0 = mem[0],zero
movq 8(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 32(%rsp), %xmm0         # xmm0 = mem[0],zero
movq 8(%rsp), %rax
xorpd %xmm1, %xmm1
ucomisd %xmm1, %xmm0
jne .LBB6_100
jnp .LBB6_101
.LBB6_100:                              # %true73
movq 672(%rax), %rcx
jmp .LBB6_102
.LBB6_101:                              # %false74
movq 696(%rax), %rcx


Thanks.


Ram
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0001.html>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: broken.asm
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0003.ksh>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: working.asm
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0004.ksh>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: broken.ll
Type: application/octet-stream
Size: 215087 bytes
Desc: broken.ll
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0002.obj>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: working.ll
Type: application/octet-stream
Size: 220892 bytes
Desc: working.ll
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0003.obj>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: broken-latest.asm
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20170301/4e629126/attachment-0005.ksh>


More information about the llvm-dev mailing list