[llvm] 928a176 - [X86][WIP] Change precision control to FP80 during u64->fp32 conversion on Windows.

Fri Jan 20 00:34:40 PST 2023

Author: Craig Topper
Date: 2023-01-20T00:34:05-08:00
New Revision: 928a1764d6bdf84073c9d85875f45c1716d6ff12

URL: https://github.com/llvm/llvm-project/commit/928a1764d6bdf84073c9d85875f45c1716d6ff12
DIFF: https://github.com/llvm/llvm-project/commit/928a1764d6bdf84073c9d85875f45c1716d6ff12.diff

LOG: [X86][WIP] Change precision control to FP80 during u64->fp32 conversion on Windows.

This is an alternative to D141074 to fix the problem by adjusting
the precision control dynamically.

This isn't quite complete yet. I want to support fadd with an load
folded into it too. That's the code we will usually generate.

Posting for early review so we can do some testing of this solution.

Differential Revision: https://reviews.llvm.org/D142178

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86InstrFPStack.td
    llvm/test/CodeGen/X86/uint64-to-float.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c88c66d8b2ed..39c01baa9ee5 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21944,15 +21944,20 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   // Extend everything to 80 bits to force it to be done on x87.
   // TODO: Are there any fast-math-flags to propagate here?
   if (IsStrict) {
-    SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
-                              {Chain, Fild, Fudge});
+    unsigned Opc = Subtarget.isOSWindows() && DstVT == MVT::f32
+                       ? X86ISD::STRICT_FP80_ADD
+                       : ISD::STRICT_FADD;
+    SDValue Add =
+        DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
     // STRICT_FP_ROUND can't handle equal types.
     if (DstVT == MVT::f80)
       return Add;
     return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
                        {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
   }
-  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+  unsigned Opc = Subtarget.isOSWindows() && DstVT == MVT::f32 ? X86ISD::FP80_ADD
+                                                              : ISD::FADD;
+  SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                      DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
 }
@@ -34739,6 +34744,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(AESDECWIDE256KL)
   NODE_NAME_CASE(CMPCCXADD)
   NODE_NAME_CASE(TESTUI)
+  NODE_NAME_CASE(FP80_ADD)
+  NODE_NAME_CASE(STRICT_FP80_ADD)
   }
   return nullptr;
 #undef NODE_NAME_CASE
@@ -37249,6 +37256,57 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return BB;
   }
 
+  case X86::FP80_ADD: {
+    // Change the floating point control register to use double extended
+    // precision when performing the addition.
+    int OrigCWFrameIdx =
+        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
+                      OrigCWFrameIdx);
+
+    // Load the old value of the control word...
+    Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
+                      OrigCWFrameIdx);
+
+    // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
+    // precision.
+    Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+    BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
+        .addReg(OldCW, RegState::Kill)
+        .addImm(0x300);
+
+    // Extract to 16 bits.
+    Register NewCW16 =
+        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
+        .addReg(NewCW, RegState::Kill, X86::sub_16bit);
+
+    // Prepare memory for FLDCW.
+    int NewCWFrameIdx =
+        MF->getFrameInfo().CreateStackObject(2, Align(2), false);
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
+                      NewCWFrameIdx)
+        .addReg(NewCW16, RegState::Kill);
+
+    // Reload the modified control word now...
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
+                      NewCWFrameIdx);
+
+    // Do the addition.
+    BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
+        .add(MI.getOperand(0))
+        .add(MI.getOperand(1))
+        .add(MI.getOperand(2));
+
+    // Reload the original control word now.
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
+                      OrigCWFrameIdx);
+
+    MI.eraseFromParent(); // The pseudo instruction is gone now.
+    return BB;
+  }
+
   case X86::FP32_TO_INT16_IN_MEM:
   case X86::FP32_TO_INT32_IN_MEM:
   case X86::FP32_TO_INT64_IN_MEM:

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index c08227b5b383..89b09148c1e3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -740,6 +740,9 @@ namespace llvm {
     // User level interrupts - testui
     TESTUI,
 
+    // Perform an FP80 add after changing precision control in FPCW.
+    FP80_ADD,
+
     /// X86 strict FP compare instructions.
     STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
     STRICT_FCMPS,
@@ -779,6 +782,9 @@ namespace llvm {
     STRICT_CVTPS2PH,
     STRICT_CVTPH2PS,
 
+    // Perform an FP80 add after changing precision control in FPCW.
+    STRICT_FP80_ADD,
+
     // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
     // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
 
@@ -886,7 +892,8 @@ namespace llvm {
     AESDECWIDE256KL,
 
     /// Compare and Add if Condition is Met. Compare value in operand 2 with
-    /// value in memory of operand 1. If condition of operand 4 is met, add value
+    /// value in memory of operand 1. If condition of operand 4 is met, add
+    /// value
     /// operand 3 to m32 and write new value in operand 1. Operand 2 is
     /// always updated with the original value from operand 1.
     CMPCCXADD,

diff  --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index a68d61043c5c..751b7bace58f 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -26,6 +26,13 @@ def SDTX86Fist      : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
 def SDTX86CwdStore  : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 def SDTX86CwdLoad   : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 
+def X86fp80_add     : SDNode<"X86ISD::FP80_ADD", SDTFPBinOp, [SDNPCommutative]>;
+def X86strict_fp80_add : SDNode<"X86ISD::STRICT_FP80_ADD", SDTFPBinOp,
+                        [SDNPHasChain,SDNPCommutative]>;
+def any_X86fp80_add : PatFrags<(ops node:$lhs, node:$rhs),
+                               [(X86strict_fp80_add node:$lhs, node:$rhs),
+                                (X86fp80_add node:$lhs, node:$rhs)]>;
+
 def X86fld          : SDNode<"X86ISD::FLD", SDTX86Fld,
                              [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def X86fst          : SDNode<"X86ISD::FST", SDTX86Fst,
@@ -141,6 +148,10 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
                               [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
   def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
                               [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+
+  def FP80_ADD : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+                         [(set RFP80:$dst,
+                               (any_X86fp80_add  RFP80:$src1, RFP80:$src2))]>;
 }
 
 // All FP Stack operations are represented with four instructions here.  The

diff  --git a/llvm/test/CodeGen/X86/uint64-to-float.ll b/llvm/test/CodeGen/X86/uint64-to-float.ll
index 8b6623476eba..b84529213c36 100644
--- a/llvm/test/CodeGen/X86/uint64-to-float.ll
+++ b/llvm/test/CodeGen/X86/uint64-to-float.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86-WIN
+; RUN: llc < %s -mtriple=x86_64-windows -mattr=+sse2 | FileCheck %s --check-prefix=X64-WIN
 
 ; Verify that we are using the efficient uitofp --> sitofp lowering illustrated
 ; by the compiler_rt implementation of __floatundisf.
@@ -42,6 +44,49 @@ define float @test(i64 %a) nounwind {
 ; X64-NEXT:    cvtsi2ss %rdi, %xmm0
 ; X64-NEXT:    addss %xmm0, %xmm0
 ; X64-NEXT:    retq
+;
+; X86-WIN-LABEL: test:
+; X86-WIN:       # %bb.0: # %entry
+; X86-WIN-NEXT:    pushl %ebp
+; X86-WIN-NEXT:    movl %esp, %ebp
+; X86-WIN-NEXT:    andl $-8, %esp
+; X86-WIN-NEXT:    subl $24, %esp
+; X86-WIN-NEXT:    movl 12(%ebp), %eax
+; X86-WIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-WIN-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    shrl $31, %eax
+; X86-WIN-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    flds __real at 5f80000000000000(,%eax,4)
+; X86-WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-WIN-NEXT:    orl $768, %eax # imm = 0x300
+; X86-WIN-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    fldcw {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    faddp %st, %st(1)
+; X86-WIN-NEXT:    fldcw {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-WIN-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    movl %ebp, %esp
+; X86-WIN-NEXT:    popl %ebp
+; X86-WIN-NEXT:    retl
+;
+; X64-WIN-LABEL: test:
+; X64-WIN:       # %bb.0: # %entry
+; X64-WIN-NEXT:    testq %rcx, %rcx
+; X64-WIN-NEXT:    js .LBB0_1
+; X64-WIN-NEXT:  # %bb.2: # %entry
+; X64-WIN-NEXT:    cvtsi2ss %rcx, %xmm0
+; X64-WIN-NEXT:    retq
+; X64-WIN-NEXT:  .LBB0_1:
+; X64-WIN-NEXT:    movq %rcx, %rax
+; X64-WIN-NEXT:    shrq %rax
+; X64-WIN-NEXT:    andl $1, %ecx
+; X64-WIN-NEXT:    orq %rax, %rcx
+; X64-WIN-NEXT:    cvtsi2ss %rcx, %xmm0
+; X64-WIN-NEXT:    addss %xmm0, %xmm0
+; X64-WIN-NEXT:    retq
 entry:
   %b = uitofp i64 %a to float
   ret float %b