[llvm] d6216e2 - [X86] Fix handling of i128<->fp on Windows

Martin Storsjö via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 29 03:06:31 PDT 2021


Author: Martin Storsjö
Date: 2021-09-29T13:05:59+03:00
New Revision: d6216e2cd1a5e07f8509215ee5422ff5ee358da8

URL: https://github.com/llvm/llvm-project/commit/d6216e2cd1a5e07f8509215ee5422ff5ee358da8
DIFF: https://github.com/llvm/llvm-project/commit/d6216e2cd1a5e07f8509215ee5422ff5ee358da8.diff

LOG: [X86] Fix handling of i128<->fp on Windows

On Windows, i128 arguments are passed indirectly (by pointer), and
i128 return values are returned in xmm0.
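
A minimal C illustration of this ABI (hypothetical example, not part
of the patch; compiled with clang targeting x86_64-windows):

    /* The __int128 parameter is passed indirectly (a pointer in %rcx),
       and the __int128 return value comes back in xmm0, which Clang
       models in IR as a <2 x i64> return. */
    __int128 negate(__int128 x) {
      return -x;
    }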

This is mostly fixed up by `WinX86_64ABIInfo::classify` in Clang, which
makes the IR functions return v2i64 instead of i128 and makes the
arguments indirect. However, for cases where libcalls are generated
during target lowering, the lowering uses the default x86_64 calling
convention for i128, where values are passed and returned in a register
pair.
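
For example, conversions like the following (illustrative C, not taken
from the patch) are lowered to compiler-rt/libgcc libcalls such as
__fixdfti and __floattidf; before this change, the backend emitted those
calls with the SysV register-pair convention even when targeting Win64:

    /* fp -> i128: lowered to a call to __fixdfti */
    __int128 to_int(double d) { return (__int128)d; }

    /* i128 -> fp: lowered to a call to __floattidf */
    double to_fp(__int128 i) { return (double)i; }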

Add custom lowering logic, similar to the existing logic for i128
div/mod (added in 4a406d32e97b1748c4eed6674a2c1819b9cf98ea), that makes
the libcall manually, overriding the return type to v2i64 (for the
fp-to-int direction) and passing the i128 arguments as pointers to
stack slots (for the int-to-fp direction).
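
In effect, on Win64 the new lowering treats the libcalls as if they had
prototypes along these lines (a sketch for illustration only; v2i64
written as a GCC/Clang vector type):

    typedef long long v2i64 __attribute__((vector_size(16)));

    /* fp -> i128: the result is read back from xmm0 as v2i64 */
    v2i64 __fixdfti(double);

    /* i128 -> fp: the i128 is spilled to a 16-byte-aligned stack slot
       and passed by pointer */
    double __floattidf(const __int128 *);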

X86CallingConv.td doesn't seem to handle i128 at all; otherwise, the
Windows-specific behaviours would ideally be implemented as overrides
there, letting generic code handle these cases automatically.

This fixes https://bugs.llvm.org/show_bug.cgi?id=48940.

Differential Revision: https://reviews.llvm.org/D110413

Added: 
    llvm/test/CodeGen/X86/i128-fpconv-win64-strict.ll
    llvm/test/CodeGen/X86/i128-fpconv-win64.ll

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index acd84dbaaad23..7a496f9a439e1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2167,6 +2167,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UDIV, MVT::i128, Custom);
     setOperationAction(ISD::SREM, MVT::i128, Custom);
     setOperationAction(ISD::UREM, MVT::i128, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
   }
 
   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
@@ -20445,6 +20453,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
+  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
+    return LowerWin64_INT128_TO_FP(Op, DAG);
+
   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
     return Extract;
 
@@ -20944,6 +20955,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   if (DstVT.isVector())
     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
 
+  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
+    return LowerWin64_INT128_TO_FP(Op, DAG);
+
   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
     return Extract;
 
@@ -28686,6 +28700,77 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
   return DAG.getBitcast(VT, CallInfo.first);
 }
 
+SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
+                                                   SelectionDAG &DAG,
+                                                   SDValue &Chain) const {
+  assert(Subtarget.isTargetWin64() && "Unexpected target");
+  EVT VT = Op.getValueType();
+  bool IsStrict = Op->isStrictFPOpcode();
+
+  SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
+  EVT ArgVT = Arg.getValueType();
+
+  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+         "Unexpected return type for lowering");
+
+  RTLIB::Libcall LC;
+  if (Op->getOpcode() == ISD::FP_TO_SINT ||
+      Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
+    LC = RTLIB::getFPTOSINT(ArgVT, VT);
+  else
+    LC = RTLIB::getFPTOUINT(ArgVT, VT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
+
+  SDLoc dl(Op);
+  MakeLibCallOptions CallOptions;
+  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
+  SDValue Result;
+  // Expect the i128 result to be returned as a v2i64 in xmm0; cast it
+  // back to the expected VT (i128).
+  std::tie(Result, Chain) =
+      makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
+  Result = DAG.getBitcast(VT, Result);
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  assert(Subtarget.isTargetWin64() && "Unexpected target");
+  EVT VT = Op.getValueType();
+  bool IsStrict = Op->isStrictFPOpcode();
+
+  SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
+  EVT ArgVT = Arg.getValueType();
+
+  assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+         "Unexpected argument type for lowering");
+
+  RTLIB::Libcall LC;
+  if (Op->getOpcode() == ISD::SINT_TO_FP ||
+      Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
+    LC = RTLIB::getSINTTOFP(ArgVT, VT);
+  else
+    LC = RTLIB::getUINTTOFP(ArgVT, VT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
+
+  SDLoc dl(Op);
+  MakeLibCallOptions CallOptions;
+  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
+  // Pass the i128 argument as an indirect argument on the stack.
+  SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+  Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
+
+  SDValue Result;
+  std::tie(Result, Chain) =
+      makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
+  return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
+}
+
 // Return true if the required (according to Opcode) shift-imm form is natively
 // supported by the Subtarget
 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
@@ -31665,6 +31750,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
+    if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
+      SDValue Chain;
+      SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
+      Results.push_back(V);
+      if (IsStrict)
+        Results.push_back(Chain);
+      return;
+    }
+
     SDValue Chain;
     if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
       Results.push_back(V);

diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index c6c16128d366b..ce63e09528f03 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1561,6 +1561,9 @@ namespace llvm {
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
+                                    SDValue &Chain) const;
+    SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/test/CodeGen/X86/i128-fpconv-win64-strict.ll b/llvm/test/CodeGen/X86/i128-fpconv-win64-strict.ll
new file mode 100644
index 0000000000000..ca1d1fecb6cd2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/i128-fpconv-win64-strict.ll
@@ -0,0 +1,208 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=WIN64
+
+define i64 @double_to_i128(double %d) nounwind {
+; WIN64-LABEL: double_to_i128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp
+; WIN64-NEXT:    callq __fixdfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $40, %rsp
+; WIN64-NEXT:    retq
+  %1 = tail call i128 @llvm.experimental.constrained.fptosi.i128.f64(double %d, metadata !"fpexcept.strict")
+  %2 = trunc i128 %1 to i64
+  ret i64 %2
+}
+
+define i64 @double_to_ui128(double %d) nounwind {
+; WIN64-LABEL: double_to_ui128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp
+; WIN64-NEXT:    callq __fixunsdfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $40, %rsp
+; WIN64-NEXT:    retq
+  %1 = tail call i128 @llvm.experimental.constrained.fptoui.i128.f64(double %d, metadata !"fpexcept.strict")
+  %2 = trunc i128 %1 to i64
+  ret i64 %2
+}
+
+define i64 @float_to_i128(float %d) nounwind {
+; WIN64-LABEL: float_to_i128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp
+; WIN64-NEXT:    callq __fixsfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $40, %rsp
+; WIN64-NEXT:    retq
+  %1 = tail call i128 @llvm.experimental.constrained.fptosi.i128.f32(float %d, metadata !"fpexcept.strict")
+  %2 = trunc i128 %1 to i64
+  ret i64 %2
+}
+
+define i64 @float_to_ui128(float %d) nounwind {
+; WIN64-LABEL: float_to_ui128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp
+; WIN64-NEXT:    callq __fixunssfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $40, %rsp
+; WIN64-NEXT:    retq
+  %1 = tail call i128 @llvm.experimental.constrained.fptoui.i128.f32(float %d, metadata !"fpexcept.strict")
+  %2 = trunc i128 %1 to i64
+  ret i64 %2
+}
+
+define i64 @longdouble_to_i128(x86_fp80* nocapture readonly %0) nounwind {
+; WIN64-LABEL: longdouble_to_i128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    fldt (%rcx)
+; WIN64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __fixxfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load x86_fp80, x86_fp80* %0, align 16
+  %3 = tail call i128 @llvm.experimental.constrained.fptosi.i128.f80(x86_fp80 %2, metadata !"fpexcept.strict")
+  %4 = trunc i128 %3 to i64
+  ret i64 %4
+}
+
+define i64 @longdouble_to_ui128(x86_fp80* nocapture readonly %0) nounwind {
+; WIN64-LABEL: longdouble_to_ui128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    fldt (%rcx)
+; WIN64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __fixunsxfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load x86_fp80, x86_fp80* %0, align 16
+  %3 = tail call i128 @llvm.experimental.constrained.fptoui.i128.f80(x86_fp80 %2, metadata !"fpexcept.strict")
+  %4 = trunc i128 %3 to i64
+  ret i64 %4
+}
+
+define double @i128_to_double(i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: i128_to_double:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __floattidf
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = tail call double @llvm.experimental.constrained.sitofp.f64.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret double %3
+}
+
+define double @ui128_to_double(i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: ui128_to_double:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __floatuntidf
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = tail call double @llvm.experimental.constrained.uitofp.f64.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret double %3
+}
+
+define float @i128_to_float(i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: i128_to_float:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __floattisf
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = tail call float @llvm.experimental.constrained.sitofp.f32.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret float %3
+}
+
+define float @ui128_to_float(i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: ui128_to_float:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __floatuntisf
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = tail call float @llvm.experimental.constrained.uitofp.f32.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret float %3
+}
+
+define void @i128_to_longdouble(x86_fp80* noalias nocapture sret(x86_fp80) align 16 %agg.result, i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: i128_to_longdouble:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    pushq %rsi
+; WIN64-NEXT:    subq $64, %rsp
+; WIN64-NEXT:    movq %rcx, %rsi
+; WIN64-NEXT:    movaps (%rdx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT:    callq __floattixf
+; WIN64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    fstpt (%rsi)
+; WIN64-NEXT:    movq %rsi, %rax
+; WIN64-NEXT:    addq $64, %rsp
+; WIN64-NEXT:    popq %rsi
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = tail call x86_fp80 @llvm.experimental.constrained.sitofp.f80.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  store x86_fp80 %3, x86_fp80* %agg.result, align 16
+  ret void
+}
+
+define void @ui128_to_longdouble(x86_fp80* noalias nocapture sret(x86_fp80) align 16 %agg.result, i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: ui128_to_longdouble:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    pushq %rsi
+; WIN64-NEXT:    subq $64, %rsp
+; WIN64-NEXT:    movq %rcx, %rsi
+; WIN64-NEXT:    movaps (%rdx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT:    callq __floatuntixf
+; WIN64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    fstpt (%rsi)
+; WIN64-NEXT:    movq %rsi, %rax
+; WIN64-NEXT:    addq $64, %rsp
+; WIN64-NEXT:    popq %rsi
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = tail call x86_fp80 @llvm.experimental.constrained.uitofp.f80.i128(i128 %2, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  store x86_fp80 %3, x86_fp80* %agg.result, align 16
+  ret void
+}
+
+declare i128 @llvm.experimental.constrained.fptosi.i128.f64(double, metadata)
+declare i128 @llvm.experimental.constrained.fptoui.i128.f64(double, metadata)
+declare i128 @llvm.experimental.constrained.fptosi.i128.f32(float, metadata)
+declare i128 @llvm.experimental.constrained.fptoui.i128.f32(float, metadata)
+declare i128 @llvm.experimental.constrained.fptosi.i128.f80(x86_fp80, metadata)
+declare i128 @llvm.experimental.constrained.fptoui.i128.f80(x86_fp80, metadata)
+declare double @llvm.experimental.constrained.sitofp.f64.i128(i128, metadata, metadata)
+declare double @llvm.experimental.constrained.uitofp.f64.i128(i128, metadata, metadata)
+declare float @llvm.experimental.constrained.sitofp.f32.i128(i128, metadata, metadata)
+declare float @llvm.experimental.constrained.uitofp.f32.i128(i128, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sitofp.f80.i128(i128, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.uitofp.f80.i128(i128, metadata, metadata)

diff --git a/llvm/test/CodeGen/X86/i128-fpconv-win64.ll b/llvm/test/CodeGen/X86/i128-fpconv-win64.ll
new file mode 100644
index 0000000000000..824ad85a7cd9b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/i128-fpconv-win64.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=WIN64
+
+define i64 @double_to_i128(double %d) nounwind {
+; WIN64-LABEL: double_to_i128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp
+; WIN64-NEXT:    callq __fixdfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $40, %rsp
+; WIN64-NEXT:    retq
+  %1 = fptosi double %d to i128
+  %2 = trunc i128 %1 to i64
+  ret i64 %2
+}
+
+define i64 @double_to_ui128(double %d) nounwind {
+; WIN64-LABEL: double_to_ui128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp
+; WIN64-NEXT:    callq __fixunsdfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $40, %rsp
+; WIN64-NEXT:    retq
+  %1 = fptoui double %d to i128
+  %2 = trunc i128 %1 to i64
+  ret i64 %2
+}
+
+define i64 @float_to_i128(float %d) nounwind {
+; WIN64-LABEL: float_to_i128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp
+; WIN64-NEXT:    callq __fixsfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $40, %rsp
+; WIN64-NEXT:    retq
+  %1 = fptosi float %d to i128
+  %2 = trunc i128 %1 to i64
+  ret i64 %2
+}
+
+define i64 @float_to_ui128(float %d) nounwind {
+; WIN64-LABEL: float_to_ui128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp
+; WIN64-NEXT:    callq __fixunssfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $40, %rsp
+; WIN64-NEXT:    retq
+  %1 = fptoui float %d to i128
+  %2 = trunc i128 %1 to i64
+  ret i64 %2
+}
+
+define i64 @longdouble_to_i128(x86_fp80* nocapture readonly %0) nounwind {
+; WIN64-LABEL: longdouble_to_i128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    fldt (%rcx)
+; WIN64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __fixxfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load x86_fp80, x86_fp80* %0, align 16
+  %3 = fptosi x86_fp80 %2 to i128
+  %4 = trunc i128 %3 to i64
+  ret i64 %4
+}
+
+define i64 @longdouble_to_ui128(x86_fp80* nocapture readonly %0) nounwind {
+; WIN64-LABEL: longdouble_to_ui128:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    fldt (%rcx)
+; WIN64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __fixunsxfti
+; WIN64-NEXT:    movq %xmm0, %rax
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load x86_fp80, x86_fp80* %0, align 16
+  %3 = fptoui x86_fp80 %2 to i128
+  %4 = trunc i128 %3 to i64
+  ret i64 %4
+}
+
+define double @i128_to_double(i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: i128_to_double:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __floattidf
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = sitofp i128 %2 to double
+  ret double %3
+}
+
+define double @ui128_to_double(i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: ui128_to_double:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __floatuntidf
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = uitofp i128 %2 to double
+  ret double %3
+}
+
+define float @i128_to_float(i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: i128_to_float:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __floattisf
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = sitofp i128 %2 to float
+  ret float %3
+}
+
+define float @ui128_to_float(i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: ui128_to_float:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $56, %rsp
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq __floatuntisf
+; WIN64-NEXT:    addq $56, %rsp
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = uitofp i128 %2 to float
+  ret float %3
+}
+
+define void @i128_to_longdouble(x86_fp80* noalias nocapture sret(x86_fp80) align 16 %agg.result, i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: i128_to_longdouble:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    pushq %rsi
+; WIN64-NEXT:    subq $64, %rsp
+; WIN64-NEXT:    movq %rcx, %rsi
+; WIN64-NEXT:    movaps (%rdx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT:    callq __floattixf
+; WIN64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    fstpt (%rsi)
+; WIN64-NEXT:    movq %rsi, %rax
+; WIN64-NEXT:    addq $64, %rsp
+; WIN64-NEXT:    popq %rsi
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = sitofp i128 %2 to x86_fp80
+  store x86_fp80 %3, x86_fp80* %agg.result, align 16
+  ret void
+}
+
+define void @ui128_to_longdouble(x86_fp80* noalias nocapture sret(x86_fp80) align 16 %agg.result, i128* nocapture readonly %0) nounwind {
+; WIN64-LABEL: ui128_to_longdouble:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    pushq %rsi
+; WIN64-NEXT:    subq $64, %rsp
+; WIN64-NEXT:    movq %rcx, %rsi
+; WIN64-NEXT:    movaps (%rdx), %xmm0
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT:    callq __floatuntixf
+; WIN64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    fstpt (%rsi)
+; WIN64-NEXT:    movq %rsi, %rax
+; WIN64-NEXT:    addq $64, %rsp
+; WIN64-NEXT:    popq %rsi
+; WIN64-NEXT:    retq
+  %2 = load i128, i128* %0, align 16
+  %3 = uitofp i128 %2 to x86_fp80
+  store x86_fp80 %3, x86_fp80* %agg.result, align 16
+  ret void
+}


        

