<pre>the below llvm code :
define linkonce_odr dso_local dereferenceable(96)
dereferenceable(96) %_Os, x86_fp80 %_Dx) local_unnamed_addr #4 comdat {
%_Ex = alloca i32, align 4
%0 = bitcast i32* %_Ex to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #8
%call = call x86_fp80 @frexpl(x86_fp80 %_Dx, i32* nonnull %_Ex) #8
%mul = fmul x86_fp80 %call, 0xK401E8000000000000000
%conv = fptosi x86_fp80 %mul to i64
%conv1 = sitofp i64 %conv to x86_fp80
%sub = fsub x86_fp80 %mul, %conv1
%call2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%call3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2, i64 %conv)
%mul.1 = fmul x86_fp80 %sub, 0xK401E8000000000000000
%conv.1 = fptosi x86_fp80 %mul.1 to i64
%conv1.1 = sitofp i64 %conv.1 to x86_fp80
%sub.1 = fsub x86_fp80 %mul.1, %conv1.1
%call2.1 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%call3.1 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.1, i64 %conv.1)
%mul.2 = fmul x86_fp80 %sub.1, 0xK401E8000000000000000
%conv.2 = fptosi x86_fp80 %mul.2 to i64
%conv1.2 = sitofp i64 %conv.2 to x86_fp80
%sub.2 = fsub x86_fp80 %mul.2, %conv1.2
%call2.2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%call3.2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.2, i64 %conv.2)
%mul.3 = fmul x86_fp80 %sub.2, 0xK401E8000000000000000
%conv.3 = fptosi x86_fp80 %mul.3 to i64
%call2.3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%call3.3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.3, i64 %conv.3)
%call4 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%1 = load i32, i32* %_Ex, align 4, !tbaa !64
%call5 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEi(%"class.std::basic_ostream"* nonnull %call4, i32 %1)
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #8
ret %"class.std::basic_ostream"* %_Os
In the X86 backend, when we have statements that convert a "double" to
"long long", the backend sets the rounding mode to "round toward zero" by
emitting the following instructions before the conversion:
MOV16mi %stack.8, 1, $noreg, 0, $noreg, 3199 :: (store 2 into %stack.8)
FLDCW16m %stack.8, 1, $noreg, 0, $noreg, implicit-def dead $fpsw :: (load 2
from %stack.8)
MOV16mr %stack.8, 1, $noreg, 0, $noreg, %32:gr16 :: (store 2 into %stack.8)
IST_Fp64m80 %stack.7, 1, $noreg, 0, $noreg, %6:rfp80, implicit-def dead $fpsw
and restores the previous control word (via FLDCW) after the conversion. However,
the machine scheduler is reordering the "FLDCW16m" (along with the "MOV16mi")
to before "LD_Fp32m80" (the log, msched.log, is attached for reference). This
results in the code below, in which the multiply executes under the wrong
rounding mode and the output is not precise.
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
pushq %r14
pushq %rbx
subq $112, %rsp
.cfi_offset %rbx, -32
.cfi_offset %r14, -24
movq %rdi, %r14
fldt 16(%rbp)
fstpt (%rsp)
leaq -40(%rbp), %rdi
callq frexpl
fnstcw -24(%rbp)
flds .LCPI83_0(%rip)
fld %st(0)
fstpt -52(%rbp) # 10-byte Folded Spill
movzwl -24(%rbp), %eax
movw $3199, -24(%rbp) # imm = 0xC7F
fldcw -24(%rbp)
fmulp %st(1)
movw %ax, -24(%rbp)
fld %st(0)
fistpll -64(%rbp)
fldcw -24(%rbp)
movq -64(%rbp), %rbx
movq %rbx, -72(%rbp)
fildll -72(%rbp)
fsubrp %st(1)
fstpt -36(%rbp) # 10-byte Folded Spill
movq %r14, %rdi
movl $32, %esi
callq _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
movq %rax, %rdi
movq %rbx, %rsi
callq _ZNSolsEl
On investigating the source file "X86InstrFPStack.td", we found that changing it
from:
let Defs = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
defm DIV : FPBinary_rr<fdiv>;
// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;
to :
let Defs = [FPSW], Uses = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
defm DIV : FPBinary_rr<fdiv>;
// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;
i.e. marking FPSW as used by fmul fixes the problem described above.
Command used: llc -march=x86-64 -O2 -debug-only=machine-scheduler
Before we go ahead with our analysis and testing, we would like to hear the experts'
suggestions on our analysis. The testcase is huge (like 20k lines, attached the
