<html>
<head>
<base href="https://bugs.llvm.org/">
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW - Reorder the fldcw w.r.t fmul in x86_64 fix ,results in wrong output."
href="https://bugs.llvm.org/show_bug.cgi?id=40529">40529</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>Reorder the fldcw w.r.t fmul in x86_64 fix ,results in wrong output.
</td>
</tr>
<tr>
<th>Product</th>
<td>clang
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>Other
</td>
</tr>
<tr>
<th>OS</th>
<td>Linux
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>LLVM Codegen
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedclangbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>umesh.kalappa0@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvm-bugs@lists.llvm.org, neeilans@live.com, richard-llvm@metafoo.co.uk
</td>
</tr></table>
<p>
<div>
<pre>Consider the following LLVM IR:
define linkonce_odr dso_local dereferenceable(96)
%"class.std::basic_ostream"*
@_ZSt6_WriteIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_e(%"class.std::basic_ostream"*
dereferenceable(96) %_Os, x86_fp80 %_Dx) local_unnamed_addr #4 comdat {
entry:
%_Ex = alloca i32, align 4
%0 = bitcast i32* %_Ex to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #8
%call = call x86_fp80 @frexpl(x86_fp80 %_Dx, i32* nonnull %_Ex) #8
%mul = fmul x86_fp80 %call, 0xK401E8000000000000000
%conv = fptosi x86_fp80 %mul to i64
%conv1 = sitofp i64 %conv to x86_fp80
%sub = fsub x86_fp80 %mul, %conv1
%call2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%call3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2, i64 %conv)
%mul.1 = fmul x86_fp80 %sub, 0xK401E8000000000000000
%conv.1 = fptosi x86_fp80 %mul.1 to i64
%conv1.1 = sitofp i64 %conv.1 to x86_fp80
%sub.1 = fsub x86_fp80 %mul.1, %conv1.1
%call2.1 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%call3.1 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.1, i64 %conv.1)
%mul.2 = fmul x86_fp80 %sub.1, 0xK401E8000000000000000
%conv.2 = fptosi x86_fp80 %mul.2 to i64
%conv1.2 = sitofp i64 %conv.2 to x86_fp80
%sub.2 = fsub x86_fp80 %mul.2, %conv1.2
%call2.2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%call3.2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.2, i64 %conv.2)
%mul.3 = fmul x86_fp80 %sub.2, 0xK401E8000000000000000
%conv.3 = fptosi x86_fp80 %mul.3 to i64
%call2.3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%call3.3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.3, i64 %conv.3)
%call4 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%1 = load i32, i32* %_Ex, align 4, !tbaa !64
%call5 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEi(%"class.std::basic_ostream"* nonnull %call4, i32 %1)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #8
ret %"class.std::basic_ostream"* %_Os
}
In the X86 backend, when a statement converts a "double" to a "long long",
the backend sets the rounding mode to "round toward zero" by emitting the
following instructions before the conversion:
MOV16mi %stack.8, 1, $noreg, 0, $noreg, 3199 :: (store 2 into %stack.8)
FLDCW16m %stack.8, 1, $noreg, 0, $noreg, implicit-def dead $fpsw :: (load 2
from %stack.8)
MOV16mr %stack.8, 1, $noreg, 0, $noreg, %32:gr16 :: (store 2 into %stack.8)
IST_Fp64m80 %stack.7, 1, $noreg, 0, $noreg, %6:rfp80, implicit-def dead $fpsw
and restores the previous FPU control word after the conversion. However,
the machine scheduler (msched) reorders the "FLDCW16m" (together with the
"MOV16mi") ahead of "LD_Fp32m80"; the log (msched.log) is attached for
reference. This results in the code below, in which fldcw is executed before
fmulp, and the output is not precise:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %r14
pushq %rbx
subq $112, %rsp
.cfi_offset %rbx, -32
.cfi_offset %r14, -24
movq %rdi, %r14
fldt 16(%rbp)
fstpt (%rsp)
leaq -40(%rbp), %rdi
callq frexpl
fnstcw -24(%rbp)
flds .LCPI83_0(%rip)
fld %st(0)
fstpt -52(%rbp) # 10-byte Folded Spill
movzwl -24(%rbp), %eax
movw $3199, -24(%rbp) # imm = 0xC7F
fldcw -24(%rbp)
fmulp %st(1)
movw %ax, -24(%rbp)
fld %st(0)
fistpll -64(%rbp)
fldcw -24(%rbp)
movq -64(%rbp), %rbx
movq %rbx, -72(%rbp)
fildll -72(%rbp)
fsubrp %st(1)
fstpt -36(%rbp) # 10-byte Folded Spill
movq %r14, %rdi
movl $32, %esi
callq _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
movq %rax, %rdi
movq %rbx, %rsi
callq _ZNSolsEl
On investigating the source "X86InstrFPStack.td", we found that changing it
from:
let Defs = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr&lt;fadd&gt;;
defm SUB : FPBinary_rr&lt;fsub&gt;;
defm MUL : FPBinary_rr&lt;fmul&gt;;
defm DIV : FPBinary_rr&lt;fdiv&gt;;
}
// Sets the scheduling resources for the actual NAME#_F&lt;size&gt;m defintions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary&lt;fadd, MRM0m, "add"&gt;;
defm SUB : FPBinary&lt;fsub, MRM4m, "sub"&gt;;
defm SUBR: FPBinary&lt;fsub ,MRM5m, "subr", 0&gt;;
}
let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary&lt;fmul, MRM1m, "mul"&gt;;
}
}
to :
let Defs = [FPSW], Uses = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr&lt;fadd&gt;;
defm SUB : FPBinary_rr&lt;fsub&gt;;
defm MUL : FPBinary_rr&lt;fmul&gt;;
defm DIV : FPBinary_rr&lt;fdiv&gt;;
}
// Sets the scheduling resources for the actual NAME#_F&lt;size&gt;m defintions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary&lt;fadd, MRM0m, "add"&gt;;
defm SUB : FPBinary&lt;fsub, MRM4m, "sub"&gt;;
defm SUBR: FPBinary&lt;fsub ,MRM5m, "subr", 0&gt;;
}
let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary&lt;fmul, MRM1m, "mul"&gt;;
}
}
i.e. marking FPSW as used by fmul, fixes the problem described above.
Command used: llc -march=x86-64 -O2 -debug-only=machine-scheduler
Before we go ahead with our analysis and testing, we would like to hear expert
opinions on this analysis. The testcase is huge (about 20k lines; it is
attached).</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>