[llvm-bugs] [Bug 40529] New: Reorder the fldcw w.r.t fmul in x86_64 fix , results in wrong output.

Wed Jan 30 06:58:56 PST 2019

https://bugs.llvm.org/show_bug.cgi?id=40529

            Bug ID: 40529
           Summary: Reorder the  fldcw w.r.t fmul in x86_64 fix ,results
                    in wrong output.
           Product: clang
           Version: trunk
          Hardware: Other
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: LLVM Codegen
          Assignee: unassignedclangbugs at nondot.org
          Reporter: umesh.kalappa0 at gmail.com
                CC: llvm-bugs at lists.llvm.org, neeilans at live.com,
                    richard-llvm at metafoo.co.uk

Consider the following LLVM IR:

  define linkonce_odr dso_local dereferenceable(96)
%"class.std::basic_ostream"*
@_ZSt6_WriteIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_e(%"class.std::basic_ostream"*
dereferenceable(96) %_Os, x86_fp80 %_Dx) local_unnamed_addr #4 comdat {
entry:
  %_Ex = alloca i32, align 4
  %0 = bitcast i32* %_Ex to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #8
  %call = call x86_fp80 @frexpl(x86_fp80 %_Dx, i32* nonnull %_Ex) #8
  %mul = fmul x86_fp80 %call, 0xK401E8000000000000000
  %conv = fptosi x86_fp80 %mul to i64
  %conv1 = sitofp i64 %conv to x86_fp80
  %sub = fsub x86_fp80 %mul, %conv1
  %call2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
  %call3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2, i64 %conv)
  %mul.1 = fmul x86_fp80 %sub, 0xK401E8000000000000000
  %conv.1 = fptosi x86_fp80 %mul.1 to i64
  %conv1.1 = sitofp i64 %conv.1 to x86_fp80
  %sub.1 = fsub x86_fp80 %mul.1, %conv1.1
  %call2.1 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
  %call3.1 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.1, i64 %conv.1)
  %mul.2 = fmul x86_fp80 %sub.1, 0xK401E8000000000000000
  %conv.2 = fptosi x86_fp80 %mul.2 to i64
  %conv1.2 = sitofp i64 %conv.2 to x86_fp80
  %sub.2 = fsub x86_fp80 %mul.2, %conv1.2
  %call2.2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
  %call3.2 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.2, i64 %conv.2)
  %mul.3 = fmul x86_fp80 %sub.2, 0xK401E8000000000000000
  %conv.3 = fptosi x86_fp80 %mul.3 to i64
  %call2.3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
  %call3.3 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEl(%"class.std::basic_ostream"* nonnull %call2.3, i64 %conv.3)
  %call4 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c(%"class.std::basic_ostream"*
nonnull dereferenceable(96) %_Os, i8 signext 32)
%1 = load i32, i32* %_Ex, align 4, !tbaa !64
  %call5 = tail call dereferenceable(96) %"class.std::basic_ostream"*
@_ZNSolsEi(%"class.std::basic_ostream"* nonnull %call4, i32 %1)
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #8
  ret %"class.std::basic_ostream"* %_Os
}

In the X86 backend, when a statement converts a "double" to a "long long",
the backend sets the rounding mode to "round toward zero" before the conversion
by emitting instructions like the following:

MOV16mi %stack.8, 1, $noreg, 0, $noreg, 3199 :: (store 2 into %stack.8)
  FLDCW16m %stack.8, 1, $noreg, 0, $noreg, implicit-def dead $fpsw :: (load 2
from %stack.8)
  MOV16mr %stack.8, 1, $noreg, 0, $noreg, %32:gr16 :: (store 2 into %stack.8)
  IST_Fp64m80 %stack.7, 1, $noreg, 0, $noreg, %6:rfp80, implicit-def dead $fpsw

and restores the previous FPU control word after the conversion. However,

the machine scheduler (msched) reorders the "FLDCW16m", along with the "MOV16mi",
before the "LD_Fp32m80" (the log, msched.log, is attached for reference). This
results in the code below, which produces imprecise output.

:
         pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        pushq   %r14
        pushq   %rbx
        subq    $112, %rsp
        .cfi_offset %rbx, -32
        .cfi_offset %r14, -24
        movq    %rdi, %r14
        fldt    16(%rbp)
        fstpt   (%rsp)
        leaq    -40(%rbp), %rdi
        callq   frexpl
        fnstcw  -24(%rbp)
        flds    .LCPI83_0(%rip)
        fld     %st(0)
        fstpt   -52(%rbp)               # 10-byte Folded Spill
        movzwl  -24(%rbp), %eax
        movw    $3199, -24(%rbp)        # imm = 0xC7F
        fldcw   -24(%rbp)
        fmulp   %st(1)
        movw    %ax, -24(%rbp)
        fld     %st(0)
        fistpll -64(%rbp)
        fldcw   -24(%rbp)
        movq    -64(%rbp), %rbx
        movq    %rbx, -72(%rbp)
        fildll  -72(%rbp)
        fsubrp  %st(1)
        fstpt   -36(%rbp)               # 10-byte Folded Spill
        movq    %r14, %rdi
        movl    $32, %esi
        callq   _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
        movq    %rax, %rdi
        movq    %rbx, %rsi
        callq   _ZNSolsEl

On investigating the source file "X86InstrFPStack.td", we made the following change:

from :

let Defs = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
defm DIV : FPBinary_rr<fdiv>;
}

// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
}

let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;
}

}

to : 

let Defs = [FPSW], Uses = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
defm DIV : FPBinary_rr<fdiv>;
}

// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
}

let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;
}

i.e. marking FPSW as used by fmul fixes the problem described above.

command used :llc -march=x86-64  -O2  -debug-only=machine-scheduler 

Before we go further with our analysis and testing, we would like the experts'
opinion on this analysis. Note that the testcase is huge (about 20k lines); it
is attached.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20190130/19029def/attachment.html>