Consider the LLVM IR below:

  define linkonce_odr dso_local dereferenceable(96)
dereferenceable(96) %_Os, x86_fp80 %_Dx) local_unnamed_addr #4 comdat {
  %_Ex = alloca i32, align 4
  %0 = bitcast i32* %_Ex to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #8
  %call = call x86_fp80 @frexpl(x86_fp80 %_Dx, i32* nonnull %_Ex) #8
  %mul = fmul x86_fp80 %call, 0xK401E8000000000000000
  %conv = fptosi x86_fp80 %mul to i64
  %conv1 = sitofp i64 %conv to x86_fp80
  %sub = fsub x86_fp80 %mul, %conv1
  %call2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
  %call3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2, i64 %conv)
  %mul.1 = fmul x86_fp80 %sub, 0xK401E8000000000000000
  %conv.1 = fptosi x86_fp80 %mul.1 to i64
  %conv1.1 = sitofp i64 %conv.1 to x86_fp80
  %sub.1 = fsub x86_fp80 %mul.1, %conv1.1
  %call2.1 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
  %call3.1 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.1, i64 %conv.1)
  %mul.2 = fmul x86_fp80 %sub.1, 0xK401E8000000000000000
  %conv.2 = fptosi x86_fp80 %mul.2 to i64
  %conv1.2 = sitofp i64 %conv.2 to x86_fp80
  %sub.2 = fsub x86_fp80 %mul.2, %conv1.2
  %call2.2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
  %call3.2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.2, i64 %conv.2)
  %mul.3 = fmul x86_fp80 %sub.2, 0xK401E8000000000000000
  %conv.3 = fptosi x86_fp80 %mul.3 to i64
  %call2.3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
  %call3.3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.3, i64 %conv.3)
  %call4 = tail call dereferenceable(96) %"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%1 = load i32, i32* %_Ex, align 4, !tbaa !64
  %call5 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEi(%"class.std::basic_ostream"* nonnull %call4, i32 %1)
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #8
  ret %"class.std::basic_ostream"* %_Os

In the X86 backend, when a statement converts a "double" to "long long"
(here, x86_fp80 to i64 via fptosi), the backend sets the rounding mode to
"round toward zero" by emitting the following instructions before the conversion:

MOV16mi %stack.8, 1, $noreg, 0, $noreg, 3199 :: (store 2 into %stack.8)
  FLDCW16m %stack.8, 1, $noreg, 0, $noreg, implicit-def dead $fpsw :: (load 2
from %stack.8)
  MOV16mr %stack.8, 1, $noreg, 0, $noreg, %32:gr16 :: (store 2 into %stack.8)
  IST_Fp64m80 %stack.7, 1, $noreg, 0, $noreg, %6:rfp80, implicit-def dead $fpsw

and restores the previous FPU control word (via FLDCW) after the conversion. However,

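the machine scheduler reorders the "FLDCW16m" (along with the "MOV16mi") to
before "LD_Fp32m80" (see the attached msched.log for reference). This results
in the code below, which does not produce a precise result. Note that in the
assembly the first fldcw is emitted before fmulp, so the multiply executes
under the modified control word rather than the original one.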

         pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        pushq   %r14
        pushq   %rbx
        subq    $112, %rsp
        .cfi_offset %rbx, -32
        .cfi_offset %r14, -24
        movq    %rdi, %r14
        fldt    16(%rbp)
        fstpt   (%rsp)
        leaq    -40(%rbp), %rdi
        callq   frexpl
        fnstcw  -24(%rbp)
        flds    .LCPI83_0(%rip)
        fld     %st(0)
        fstpt   -52(%rbp)               # 10-byte Folded Spill
        movzwl  -24(%rbp), %eax
        movw    $3199, -24(%rbp)        # imm = 0xC7F
        fldcw   -24(%rbp)
        fmulp   %st(1)
        movw    %ax, -24(%rbp)
        fld     %st(0)
        fistpll -64(%rbp)
        fldcw   -24(%rbp)
        movq    -64(%rbp), %rbx
        movq    %rbx, -72(%rbp)
        fildll  -72(%rbp)
        fsubrp  %st(1)
        fstpt   -36(%rbp)               # 10-byte Folded Spill
        movq    %r14, %rdi
        movl    $32, %esi
        callq   _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
        movq    %rax, %rdi
        movq    %rbx, %rsi
        callq   _ZNSolsEl

On investigating the source file "X86InstrFPStack.td", we made the following change:

from :

let Defs = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
defm DIV : FPBinary_rr<fdiv>;

// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;

let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;


to : 

let Defs = [FPSW], Uses = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
defm DIV : FPBinary_rr<fdiv>;

// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;

let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;

i.e. marking FPSW as used by fmul fixes the problem shown above.

Command used: llc -march=x86-64 -O2 -debug-only=machine-scheduler

Before we go ahead with our analysis and testing, we would like to hear expert
suggestions on our analysis. The testcase is huge (about 20k lines; attached the

