[llvm] 8fdafb7 - Insert wait instruction after X87 instructions which could raise

Wed Jan 15 20:13:10 PST 2020

Author: Liu, Chen3
Date: 2020-01-16T12:12:51+08:00
New Revision: 8fdafb7dced812b2dc0af77f9668bfe23b4ffb0b

URL: https://github.com/llvm/llvm-project/commit/8fdafb7dced812b2dc0af77f9668bfe23b4ffb0b
DIFF: https://github.com/llvm/llvm-project/commit/8fdafb7dced812b2dc0af77f9668bfe23b4ffb0b.diff

LOG: Insert wait instruction after X87 instructions which could raise
float-point exception.

This patch also modify some mayRaiseFPException flag which set in D68854.

Differential Revision: https://reviews.llvm.org/D72750

Added: 
    llvm/lib/Target/X86/X86InsertWait.cpp

Modified: 
    llvm/lib/Target/X86/CMakeLists.txt
    llvm/lib/Target/X86/X86.h
    llvm/lib/Target/X86/X86FloatingPoint.cpp
    llvm/lib/Target/X86/X86InstrFPStack.td
    llvm/lib/Target/X86/X86TargetMachine.cpp
    llvm/test/CodeGen/X86/O0-pipeline.ll
    llvm/test/CodeGen/X86/O3-pipeline.ll
    llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll
    llvm/test/CodeGen/X86/fp-intrinsics.ll
    llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
    llvm/test/CodeGen/X86/fp-strict-scalar.ll
    llvm/test/CodeGen/X86/fp128-cast-strict.ll
    llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll
    llvm/test/CodeGen/X86/fp80-strict-scalar.ll
    llvm/test/CodeGen/X86/vec-strict-128.ll
    llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
    llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
    llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
    llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
    llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
    llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
    llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 58f2292dd4cd..3f0d68c0c788 100644

--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -69,6 +69,7 @@ set(sources
   X86VZeroUpper.cpp
   X86WinAllocaExpander.cpp
   X86WinEHState.cpp
+  X86InsertWait.cpp
   )
 
 add_llvm_target(X86CodeGen ${sources})

diff  --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 0481a40d462a..604438f83531 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -129,6 +129,10 @@ FunctionPass *createX86DiscriminateMemOpsPass();
 /// This pass applies profiling information to insert cache prefetches.
 FunctionPass *createX86InsertPrefetchPass();
 
+/// This pass insert wait instruction after X87 instructions which could raise
+/// fp exceptions when strict-fp enabled.
+FunctionPass *createX86InsertX87waitPass();
+
 InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   X86Subtarget &,
                                                   X86RegisterBankInfo &);

diff  --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index 13bbd6ccfce4..e6ee46957500 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -1364,6 +1364,9 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
   MBB->remove(&*I++);
   I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
 
+  if (!MI.mayRaiseFPException())
+    I->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
   // If both operands are killed, pop one off of the stack in addition to
   // overwriting the other one.
   if (KillsOp0 && KillsOp1 && Op0 != Op1) {

diff  --git a/llvm/lib/Target/X86/X86InsertWait.cpp b/llvm/lib/Target/X86/X86InsertWait.cpp
new file mode 100644
index 000000000000..a82d98d88b30
--- /dev/null
+++ b/llvm/lib/Target/X86/X86InsertWait.cpp
@@ -0,0 +1,151 @@
+//-  X86Insertwait.cpp - Strict-Fp:Insert wait instruction X87 instructions --//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which insert x86 wait instructions after each
+// X87 instructions when strict float is enabled.
+//
+// The logic to insert a wait instruction after an X87 instruction is as below:
+// 1. If the X87 instruction don't raise float exception nor is a load/store
+//    instruction, or is a x87 control instruction, don't insert wait.
+// 2. If the X87 instruction is an instruction which the following instruction
+//    is an X87 exception synchronizing X87 instruction, don't insert wait.
+// 3. For other situations, insert wait instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-insert-wait"
+
+namespace {
+
+class WaitInsert : public MachineFunctionPass {
+public:
+  static char ID;
+
+  WaitInsert() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "X86 insert wait instruction";
+  }
+
+private:
+  const TargetInstrInfo *TII; // Machine instruction info.
+};
+
+} // namespace
+
+char WaitInsert::ID = 0;
+
+FunctionPass *llvm::createX86InsertX87waitPass() { return new WaitInsert(); }
+
+/// Return true if the Reg is X87 register.
+static bool isX87Reg(unsigned Reg) {
+  return (Reg == X86::FPCW || Reg == X86::FPSW ||
+          (Reg >= X86::ST0 && Reg <= X86::ST7));
+}
+
+/// check if the instruction is X87 instruction
+static bool isX87Instruction(MachineInstr &MI) {
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg())
+      continue;
+    if (isX87Reg(MO.getReg()))
+      return true;
+  }
+  return false;
+}
+
+static bool isX87ControlInstruction(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case X86::FNINIT:
+  case X86::FLDCW16m:
+  case X86::FNSTCW16m:
+  case X86::FNSTSW16r:
+  case X86::FNSTSWm:
+  case X86::FNCLEX:
+  case X86::FLDENVm:
+  case X86::FSTENVm:
+  case X86::FRSTORm:
+  case X86::FSAVEm:
+  case X86::FINCSTP:
+  case X86::FDECSTP:
+  case X86::FFREE:
+  case X86::FFREEP:
+  case X86::FNOP:
+  case X86::WAIT:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool isX87NonWaitingControlInstruction(MachineInstr &MI) {
+  // a few special control instructions don't perform a wait operation
+  switch (MI.getOpcode()) {
+  case X86::FNINIT:
+  case X86::FNSTSW16r:
+  case X86::FNSTSWm:
+  case X86::FNSTCW16m:
+  case X86::FNCLEX:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
+  if (!MF.getFunction().hasFnAttribute(Attribute::StrictFP))
+    return false;
+
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  TII = ST.getInstrInfo();
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator MI = MBB.begin(); MI != MBB.end(); ++MI) {
+      // Jump non X87 instruction.
+      if (!isX87Instruction(*MI))
+        continue;
+      // If the instruction instruction neither has float exception nor is
+      // a load/store instruction, or the instruction is x87 control
+      // instruction, do not insert wait.
+      if (!(MI->mayRaiseFPException() || MI->mayLoadOrStore()) ||
+          isX87ControlInstruction(*MI))
+        continue;
+      // If the following instruction is an X87 instruction and isn't an X87
+      // non-waiting control instruction, we can omit insert wait instruction.
+      MachineBasicBlock::iterator AfterMI = std::next(MI);
+      if (AfterMI != MBB.end() && isX87Instruction(*AfterMI) &&
+          !isX87NonWaitingControlInstruction(*AfterMI))
+        continue;
+
+      BuildMI(MBB, AfterMI, MI->getDebugLoc(), TII->get(X86::WAIT));
+      LLVM_DEBUG(dbgs() << "\nInsert wait after:\t" << *MI);
+      // Jump the newly inserting wait
+      ++MI;
+      Changed = true;
+    }
+  }
+  return Changed;
+}

diff  --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index 1830262205c6..677859ef0805 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -601,6 +601,7 @@ let SchedRW = [WriteMove], Uses = [FPCW] in {
 def LD_Frr   : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">;
 def ST_Frr   : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">;
 def ST_FPrr  : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">;
+let mayRaiseFPException = 0 in
 def XCH_F    : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">;
 }
 
@@ -620,13 +621,13 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
                 [(set RFP80:$dst, fpimm1)]>;
 }
 
-let SchedRW = [WriteFLD0], Uses = [FPCW] in
+let SchedRW = [WriteFLD0], Uses = [FPCW], mayRaiseFPException = 0 in
 def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
 
-let SchedRW = [WriteFLD1], Uses = [FPCW] in
+let SchedRW = [WriteFLD1], Uses = [FPCW], mayRaiseFPException = 0 in
 def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
 
-let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW], mayRaiseFPException = 0 in {
 def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
 def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
 def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;

diff  --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8c696e9adbed..22b4e2805a5e 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -519,6 +519,7 @@ void X86PassConfig::addPreEmitPass() {
   }
   addPass(createX86DiscriminateMemOpsPass());
   addPass(createX86InsertPrefetchPass());
+  addPass(createX86InsertX87waitPass());
 }
 
 void X86PassConfig::addPreEmitPass2() {

diff  --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 33ecad677a63..5dfe02147e13 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -65,6 +65,7 @@
 ; CHECK-NEXT:       X86 vzeroupper inserter
 ; CHECK-NEXT:       X86 Discriminate Memory Operands
 ; CHECK-NEXT:       X86 Insert Cache Prefetches
+; CHECK-NEXT:       X86 insert wait instruction
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
 ; CHECK-NEXT:       StackMap Liveness Analysis
 ; CHECK-NEXT:       Live DEBUG_VALUE analysis

diff  --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll
index 575b704b8b4c..3d96753b553f 100644
--- a/llvm/test/CodeGen/X86/O3-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O3-pipeline.ll
@@ -174,6 +174,7 @@
 ; CHECK-NEXT:       Compressing EVEX instrs to VEX encoding when possible
 ; CHECK-NEXT:       X86 Discriminate Memory Operands
 ; CHECK-NEXT:       X86 Insert Cache Prefetches
+; CHECK-NEXT:       X86 insert wait instruction
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
 ; CHECK-NEXT:       StackMap Liveness Analysis
 ; CHECK-NEXT:       Live DEBUG_VALUE analysis

diff  --git a/llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll b/llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll
index 9c408c70cfbf..efa86e241bad 100644
--- a/llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll
+++ b/llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll
@@ -6,6 +6,7 @@ define x86_fp80 @constrained_fpext_f32_as_fp80(float %mem) #0 {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movss %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    flds -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 entry:
   %ext = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f32(
@@ -19,6 +20,7 @@ define float @constrained_fptrunc_f80_to_f32(x86_fp80 %reg) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    retq
   %trunc = call float @llvm.experimental.constrained.fptrunc.f32.f80(
@@ -33,6 +35,7 @@ define x86_fp80 @constrained_fpext_f64_to_f80(double %mem) #0 {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movsd %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 entry:
   %ext = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(
@@ -46,6 +49,7 @@ define double @constrained_fptrunc_f80_to_f64(x86_fp80 %reg) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fstpl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    retq
   %trunc = call double @llvm.experimental.constrained.fptrunc.f64.f80(

diff  --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index e00248b22df8..27f198168e38 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -19,6 +19,7 @@ define double @f1() #0 {
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    fld1
 ; X87-NEXT:    fdivs {{\.LCPI.*}}
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: f1:
@@ -29,6 +30,7 @@ define double @f1() #0 {
 ; X86-SSE-NEXT:    divsd {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -66,6 +68,7 @@ define double @f2(double %a) #0 {
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    fldz
 ; X87-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: f2:
@@ -77,6 +80,7 @@ define double @f2(double %a) #0 {
 ; X86-SSE-NEXT:    subsd %xmm1, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -119,6 +123,7 @@ define double @f3(double %a, double %b) #0 {
 ; X87-NEXT:    fsubl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fmull {{[0-9]+}}(%esp)
 ; X87-NEXT:    fsubrp %st, %st(1)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: f3:
@@ -132,6 +137,7 @@ define double @f3(double %a, double %b) #0 {
 ; X86-SSE-NEXT:    subsd %xmm1, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -185,11 +191,13 @@ define double @f4(i32 %n, double %a) #0 {
 ; X87-LABEL: f4:
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X87-NEXT:    jle .LBB3_2
 ; X87-NEXT:  # %bb.1: # %if.then
 ; X87-NEXT:    fld1
 ; X87-NEXT:    faddp %st, %st(1)
+; X87-NEXT:    wait
 ; X87-NEXT:  .LBB3_2: # %if.end
 ; X87-NEXT:    retl
 ;
@@ -205,6 +213,7 @@ define double @f4(i32 %n, double %a) #0 {
 ; X86-SSE-NEXT:  .LBB3_2: # %if.end
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -248,6 +257,7 @@ define double @f5() #0 {
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    flds {{\.LCPI.*}}
 ; X87-NEXT:    fsqrt
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: f5:
@@ -258,6 +268,7 @@ define double @f5() #0 {
 ; X86-SSE-NEXT:    sqrtsd %xmm0, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -290,6 +301,7 @@ define double @f6() #0 {
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll pow
 ; X87-NEXT:    addl $28, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -345,6 +357,7 @@ define double @f7() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    fldl {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    movl $3, {{[0-9]+}}(%esp)
 ; X87-NEXT:    calll __powidf2
 ; X87-NEXT:    addl $12, %esp
@@ -400,6 +413,7 @@ define double @f8() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll sin
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -450,6 +464,7 @@ define double @f9() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll cos
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -500,6 +515,7 @@ define double @f10() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll exp
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -550,6 +566,7 @@ define double @f11() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    fldl {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll exp2
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -600,6 +617,7 @@ define double @f12() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll log
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -650,6 +668,7 @@ define double @f13() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll log10
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -700,6 +719,7 @@ define double @f14() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll log2
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -750,6 +770,7 @@ define double @f15() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    fldl {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll rint
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -797,6 +818,7 @@ define double @f16() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    fldl {{\.LCPI.*}}
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll nearbyint
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -843,6 +865,7 @@ define double @f19() #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 32
 ; X87-NEXT:    flds {{\.LCPI.*}}
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    movl $1072693248, {{[0-9]+}}(%esp) # imm = 0x3FF00000
 ; X87-NEXT:    movl $0, (%esp)
 ; X87-NEXT:    calll fmod
@@ -904,6 +927,7 @@ define i8 @f20s8(double %x) #0 {
 ; X87-NEXT:    subl $8, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 12
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -950,6 +974,7 @@ define i16 @f20s16(double %x) #0 {
 ; X87-NEXT:    subl $8, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 12
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -994,6 +1019,7 @@ define i32 @f20s(double %x) #0 {
 ; X87-NEXT:    subl $8, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 12
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw (%esp)
 ; X87-NEXT:    movzwl (%esp), %eax
 ; X87-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1036,6 +1062,7 @@ define i64 @f20s64(double %x) #0 {
 ; X87-NEXT:    subl $20, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 24
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1056,6 +1083,7 @@ define i64 @f20s64(double %x) #0 {
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1096,6 +1124,7 @@ define i128 @f20s128(double %x) nounwind strictfp {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    calll __fixdfti
@@ -1170,6 +1199,7 @@ define i8 @f20u8(double %x) #0 {
 ; X87-NEXT:    subl $8, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 12
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1215,6 +1245,7 @@ define i16 @f20u16(double %x) #0 {
 ; X87-NEXT:    subl $8, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 12
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw (%esp)
 ; X87-NEXT:    movzwl (%esp), %eax
 ; X87-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1262,6 +1293,7 @@ define i32 @f20u(double %x) #0 {
 ; X87-NEXT:    subl $20, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 24
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1325,14 +1357,17 @@ define i64 @f20u64(double %x) #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 24
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{\.LCPI.*}}
+; X87-NEXT:    wait
 ; X87-NEXT:    xorl %edx, %edx
 ; X87-NEXT:    fcomi %st(1), %st
+; X87-NEXT:    wait
 ; X87-NEXT:    setbe %dl
 ; X87-NEXT:    fldz
 ; X87-NEXT:    fxch %st(1)
 ; X87-NEXT:    fcmovnbe %st(1), %st
 ; X87-NEXT:    fstp %st(1)
 ; X87-NEXT:    fsubrp %st, %st(1)
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1363,6 +1398,7 @@ define i64 @f20u64(double %x) #0 {
 ; X86-SSE-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    setbe %al
 ; X86-SSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -1435,6 +1471,7 @@ define i128 @f20u128(double %x) nounwind strictfp {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    calll __fixunsdfti
@@ -1509,6 +1546,7 @@ define float @f21() #0 {
 ; X87-NEXT:    fldl {{\.LCPI.*}}
 ; X87-NEXT:    fstps (%esp)
 ; X87-NEXT:    flds (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -1521,6 +1559,7 @@ define float @f21() #0 {
 ; X86-SSE-NEXT:    cvtsd2ss %xmm0, %xmm0
 ; X86-SSE-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    popl %eax
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -1548,6 +1587,7 @@ define double @f22(float %x) #0 {
 ; X87-LABEL: f22:
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: f22:
@@ -1558,6 +1598,7 @@ define double @f22(float %x) #0 {
 ; X86-SSE-NEXT:    cvtss2sd %xmm0, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -1584,6 +1625,7 @@ define i32 @f23(double %x) #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll lrint
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -1631,6 +1673,7 @@ define i32 @f24(float %x) #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll lrintf
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -1678,6 +1721,7 @@ define i64 @f25(double %x) #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll llrint
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -1772,6 +1816,7 @@ define i32 @f27(double %x) #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll lround
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -1818,6 +1863,7 @@ define i32 @f28(float %x) #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll lroundf
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -1864,6 +1910,7 @@ define i64 @f29(double %x) #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll llround
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -1910,6 +1957,7 @@ define i64 @f30(float %x) #0 {
 ; X87-NEXT:    .cfi_def_cfa_offset 16
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll llroundf
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
@@ -1960,6 +2008,7 @@ define double @sifdb(i8 %x) #0 {
 ; X87-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -1972,6 +2021,7 @@ define double @sifdb(i8 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2sd %eax, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2002,6 +2052,7 @@ define double @sifdw(i16 %x) #0 {
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2014,6 +2065,7 @@ define double @sifdw(i16 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2sd %eax, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2044,6 +2096,7 @@ define double @sifdi(i32 %x) #0 {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2055,6 +2108,7 @@ define double @sifdi(i32 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2sdl {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2083,6 +2137,7 @@ define float @siffb(i8 %x) #0 {
 ; X87-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2095,6 +2150,7 @@ define float @siffb(i8 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2ss %eax, %xmm0
 ; X86-SSE-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    popl %eax
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2125,6 +2181,7 @@ define float @siffw(i16 %x) #0 {
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2137,6 +2194,7 @@ define float @siffw(i16 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2ss %eax, %xmm0
 ; X86-SSE-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    popl %eax
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2167,6 +2225,7 @@ define float @siffi(i32 %x) #0 {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2178,6 +2237,7 @@ define float @siffi(i32 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    popl %eax
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2202,6 +2262,7 @@ define double @sifdl(i64 %x) #0 {
 ; X87-LABEL: sifdl:
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    fildll {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: sifdl:
@@ -2211,6 +2272,7 @@ define double @sifdl(i64 %x) #0 {
 ; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fstpl (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2235,6 +2297,7 @@ define float @siffl(i64 %x) #0 {
 ; X87-LABEL: siffl:
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    fildll {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: siffl:
@@ -2244,6 +2307,7 @@ define float @siffl(i64 %x) #0 {
 ; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fstps (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    popl %eax
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2275,6 +2339,7 @@ define double @uifdb(i8 %x) #0 {
 ; X87-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2287,6 +2352,7 @@ define double @uifdb(i8 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2sd %eax, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2317,6 +2383,7 @@ define double @uifdw(i16 %x) #0 {
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2329,6 +2396,7 @@ define double @uifdw(i16 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2sd %eax, %xmm0
 ; X86-SSE-NEXT:    movsd %xmm0, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2360,6 +2428,7 @@ define double @uifdi(i32 %x) #0 {
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2374,6 +2443,7 @@ define double @uifdi(i32 %x) #0 {
 ; X86-SSE-NEXT:    subsd %xmm0, %xmm1
 ; X86-SSE-NEXT:    movsd %xmm1, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2415,6 +2485,7 @@ define double @uifdl(i64 %x) #0 {
 ; X87-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    addl $20, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2431,6 +2502,7 @@ define double @uifdl(i64 %x) #0 {
 ; X86-SSE-NEXT:    addpd %xmm0, %xmm1
 ; X86-SSE-NEXT:    movlpd %xmm1, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2473,6 +2545,7 @@ define float @uiffb(i8 %x) #0 {
 ; X87-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2485,6 +2558,7 @@ define float @uiffb(i8 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2ss %eax, %xmm0
 ; X86-SSE-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    popl %eax
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2515,6 +2589,7 @@ define float @uiffw(i16 %x) #0 {
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2527,6 +2602,7 @@ define float @uiffw(i16 %x) #0 {
 ; X86-SSE-NEXT:    cvtsi2ss %eax, %xmm0
 ; X86-SSE-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    popl %eax
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2558,6 +2634,7 @@ define float @uiffi(i32 %x) #0 {
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2574,6 +2651,7 @@ define float @uiffi(i32 %x) #0 {
 ; X86-SSE-NEXT:    cvtsd2ss %xmm1, %xmm0
 ; X86-SSE-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    popl %eax
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
@@ -2615,6 +2693,7 @@ define float @uiffl(i64 %x) #0 {
 ; X87-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
 ; X87-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    addl $20, %esp
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -2630,9 +2709,11 @@ define float @uiffl(i64 %x) #0 {
 ; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE-NEXT:    flds (%esp)
+; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $20, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
index 45d7d513e6fb..1bc308bef8cc 100644
--- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
@@ -7,9 +7,11 @@ define float @ceil(float %x) #0 {
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _ceil
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.ceil.f32(float %x, metadata !"fpexcept.strict") #0
@@ -22,9 +24,11 @@ define float @cos(float %x) #0 {
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _cos
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.cos.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -37,9 +41,11 @@ define float @exp(float %x) #0 {
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _exp
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.exp.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -52,9 +58,11 @@ define float @floor(float %x) #0 {
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _floor
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.floor.f32(float %x, metadata !"fpexcept.strict") #0
@@ -70,9 +78,11 @@ define float @frem(float %x, float %y) #0 {
 ; CHECK-NEXT:    fxch %st(1)
 ; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _fmod
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $20, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.frem.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -85,9 +95,11 @@ define float @log(float %x) #0 {
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _log
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.log.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -100,9 +112,11 @@ define float @log10(float %x) #0 {
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _log10
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.log10.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -118,9 +132,11 @@ define float @pow(float %x, float %y) #0 {
 ; CHECK-NEXT:    fxch %st(1)
 ; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _pow
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $20, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.pow.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -133,9 +149,11 @@ define float @sin(float %x) #0 {
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    calll _sin
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
   %result = call float @llvm.experimental.constrained.sin.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0

diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
index f27076028198..115e16583bf4 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
@@ -52,6 +52,7 @@ define i32 @test_f32_oeq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -71,6 +72,7 @@ define i32 @test_f32_oeq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -124,6 +126,7 @@ define i32 @test_f32_ogt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -143,6 +146,7 @@ define i32 @test_f32_ogt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
@@ -195,6 +199,7 @@ define i32 @test_f32_oge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -214,6 +219,7 @@ define i32 @test_f32_oge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
@@ -266,6 +272,7 @@ define i32 @test_f32_olt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -285,6 +292,7 @@ define i32 @test_f32_olt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
@@ -337,6 +345,7 @@ define i32 @test_f32_ole_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -356,6 +365,7 @@ define i32 @test_f32_ole_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
@@ -408,6 +418,7 @@ define i32 @test_f32_one_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -427,6 +438,7 @@ define i32 @test_f32_one_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -479,6 +491,7 @@ define i32 @test_f32_ord_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -498,6 +511,7 @@ define i32 @test_f32_ord_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnpl %eax, %ecx
@@ -550,6 +564,7 @@ define i32 @test_f32_ueq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -569,6 +584,7 @@ define i32 @test_f32_ueq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovel %eax, %ecx
@@ -621,6 +637,7 @@ define i32 @test_f32_ugt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -640,6 +657,7 @@ define i32 @test_f32_ugt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
@@ -692,6 +710,7 @@ define i32 @test_f32_uge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -711,6 +730,7 @@ define i32 @test_f32_uge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
@@ -763,6 +783,7 @@ define i32 @test_f32_ult_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -782,6 +803,7 @@ define i32 @test_f32_ult_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
@@ -834,6 +856,7 @@ define i32 @test_f32_ule_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -853,6 +876,7 @@ define i32 @test_f32_ule_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
@@ -909,6 +933,7 @@ define i32 @test_f32_une_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -928,6 +953,7 @@ define i32 @test_f32_une_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -981,6 +1007,7 @@ define i32 @test_f32_uno_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1000,6 +1027,7 @@ define i32 @test_f32_uno_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
@@ -1056,6 +1084,7 @@ define i32 @test_f64_oeq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1075,6 +1104,7 @@ define i32 @test_f64_oeq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -1128,6 +1158,7 @@ define i32 @test_f64_ogt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1147,6 +1178,7 @@ define i32 @test_f64_ogt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
@@ -1199,6 +1231,7 @@ define i32 @test_f64_oge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1218,6 +1251,7 @@ define i32 @test_f64_oge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
@@ -1270,6 +1304,7 @@ define i32 @test_f64_olt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1289,6 +1324,7 @@ define i32 @test_f64_olt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
@@ -1341,6 +1377,7 @@ define i32 @test_f64_ole_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1360,6 +1397,7 @@ define i32 @test_f64_ole_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
@@ -1412,6 +1450,7 @@ define i32 @test_f64_one_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1431,6 +1470,7 @@ define i32 @test_f64_one_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -1483,6 +1523,7 @@ define i32 @test_f64_ord_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1502,6 +1543,7 @@ define i32 @test_f64_ord_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnpl %eax, %ecx
@@ -1554,6 +1596,7 @@ define i32 @test_f64_ueq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1573,6 +1616,7 @@ define i32 @test_f64_ueq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovel %eax, %ecx
@@ -1625,6 +1669,7 @@ define i32 @test_f64_ugt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1644,6 +1689,7 @@ define i32 @test_f64_ugt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
@@ -1696,6 +1742,7 @@ define i32 @test_f64_uge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1715,6 +1762,7 @@ define i32 @test_f64_uge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
@@ -1767,6 +1815,7 @@ define i32 @test_f64_ult_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1786,6 +1835,7 @@ define i32 @test_f64_ult_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
@@ -1838,6 +1888,7 @@ define i32 @test_f64_ule_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1857,6 +1908,7 @@ define i32 @test_f64_ule_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
@@ -1913,6 +1965,7 @@ define i32 @test_f64_une_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -1932,6 +1985,7 @@ define i32 @test_f64_une_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -1985,6 +2039,7 @@ define i32 @test_f64_uno_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fucompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2004,6 +2059,7 @@ define i32 @test_f64_uno_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
@@ -2060,6 +2116,7 @@ define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2079,6 +2136,7 @@ define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -2132,6 +2190,7 @@ define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2151,6 +2210,7 @@ define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
@@ -2203,6 +2263,7 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2222,6 +2283,7 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
@@ -2274,6 +2336,7 @@ define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2293,6 +2356,7 @@ define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
@@ -2345,6 +2409,7 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2364,6 +2429,7 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
@@ -2416,6 +2482,7 @@ define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2435,6 +2502,7 @@ define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -2487,6 +2555,7 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2506,6 +2575,7 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnpl %eax, %ecx
@@ -2558,6 +2628,7 @@ define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2577,6 +2648,7 @@ define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovel %eax, %ecx
@@ -2629,6 +2701,7 @@ define i32 @test_f32_ugt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2648,6 +2721,7 @@ define i32 @test_f32_ugt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
@@ -2700,6 +2774,7 @@ define i32 @test_f32_uge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2719,6 +2794,7 @@ define i32 @test_f32_uge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
@@ -2771,6 +2847,7 @@ define i32 @test_f32_ult_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2790,6 +2867,7 @@ define i32 @test_f32_ult_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
@@ -2842,6 +2920,7 @@ define i32 @test_f32_ule_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2861,6 +2940,7 @@ define i32 @test_f32_ule_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
@@ -2917,6 +2997,7 @@ define i32 @test_f32_une_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -2936,6 +3017,7 @@ define i32 @test_f32_une_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -2989,6 +3071,7 @@ define i32 @test_f32_uno_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3008,6 +3091,7 @@ define i32 @test_f32_uno_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
@@ -3064,6 +3148,7 @@ define i32 @test_f64_oeq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3083,6 +3168,7 @@ define i32 @test_f64_oeq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -3136,6 +3222,7 @@ define i32 @test_f64_ogt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3155,6 +3242,7 @@ define i32 @test_f64_ogt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
@@ -3207,6 +3295,7 @@ define i32 @test_f64_oge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3226,6 +3315,7 @@ define i32 @test_f64_oge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
@@ -3278,6 +3368,7 @@ define i32 @test_f64_olt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3297,6 +3388,7 @@ define i32 @test_f64_olt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
@@ -3349,6 +3441,7 @@ define i32 @test_f64_ole_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3368,6 +3461,7 @@ define i32 @test_f64_ole_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
@@ -3420,6 +3514,7 @@ define i32 @test_f64_one_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3439,6 +3534,7 @@ define i32 @test_f64_one_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -3491,6 +3587,7 @@ define i32 @test_f64_ord_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3510,6 +3607,7 @@ define i32 @test_f64_ord_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnpl %eax, %ecx
@@ -3562,6 +3660,7 @@ define i32 @test_f64_ueq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3581,6 +3680,7 @@ define i32 @test_f64_ueq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovel %eax, %ecx
@@ -3633,6 +3733,7 @@ define i32 @test_f64_ugt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3652,6 +3753,7 @@ define i32 @test_f64_ugt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
@@ -3704,6 +3806,7 @@ define i32 @test_f64_uge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3723,6 +3826,7 @@ define i32 @test_f64_uge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
@@ -3775,6 +3879,7 @@ define i32 @test_f64_ult_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3794,6 +3899,7 @@ define i32 @test_f64_ult_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
@@ -3846,6 +3952,7 @@ define i32 @test_f64_ule_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3865,6 +3972,7 @@ define i32 @test_f64_ule_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
@@ -3921,6 +4029,7 @@ define i32 @test_f64_une_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -3940,6 +4049,7 @@ define i32 @test_f64_une_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
@@ -3993,6 +4103,7 @@ define i32 @test_f64_uno_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fcompp
+; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
@@ -4012,6 +4123,7 @@ define i32 @test_f64_uno_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
+; X87-CMOV-NEXT:    wait
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx

diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
index c50d092aba10..c3576addfcdd 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
@@ -59,6 +59,7 @@ define i1 @fptosi_f32toi1(float %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -105,6 +106,7 @@ define i8 @fptosi_f32toi8(float %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -151,6 +153,7 @@ define i16 @fptosi_f32toi16(float %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -193,6 +196,7 @@ define i32 @fptosi_f32toi32(float %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw (%esp)
 ; CHECK-NEXT:    movzwl (%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -222,6 +226,7 @@ define i64 @fptosi_f32toi64(float %x) #0 {
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; SSE-X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -254,6 +259,7 @@ define i64 @fptosi_f32toi64(float %x) #0 {
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
 ; AVX-X86-NEXT:    fisttpll (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl (%esp), %eax
 ; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; AVX-X86-NEXT:    movl %ebp, %esp
@@ -276,6 +282,7 @@ define i64 @fptosi_f32toi64(float %x) #0 {
 ; CHECK-NEXT:    andl $-8, %esp
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    flds 8(%ebp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -324,6 +331,7 @@ define i1 @fptoui_f32toi1(float %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -370,6 +378,7 @@ define i8 @fptoui_f32toi8(float %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -416,6 +425,7 @@ define i16 @fptoui_f32toi16(float %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw (%esp)
 ; CHECK-NEXT:    movzwl (%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -471,6 +481,7 @@ define i32 @fptoui_f32toi32(float %x) #0 {
 ; AVX1-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX1-X86-NEXT:    flds (%esp)
 ; AVX1-X86-NEXT:    fisttpll (%esp)
+; AVX1-X86-NEXT:    wait
 ; AVX1-X86-NEXT:    movl (%esp), %eax
 ; AVX1-X86-NEXT:    movl %ebp, %esp
 ; AVX1-X86-NEXT:    popl %ebp
@@ -503,6 +514,7 @@ define i32 @fptoui_f32toi32(float %x) #0 {
 ; CHECK-NEXT:    andl $-8, %esp
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    flds 8(%ebp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -542,6 +554,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    setbe %al
 ; SSE-X86-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; SSE-X86-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -596,6 +609,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; AVX1-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX1-X86-NEXT:    flds (%esp)
 ; AVX1-X86-NEXT:    fisttpll (%esp)
+; AVX1-X86-NEXT:    wait
 ; AVX1-X86-NEXT:    setbe %al
 ; AVX1-X86-NEXT:    movzbl %al, %edx
 ; AVX1-X86-NEXT:    shll $31, %edx
@@ -644,6 +658,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; AVX512-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX512-X86-NEXT:    flds (%esp)
 ; AVX512-X86-NEXT:    fisttpll (%esp)
+; AVX512-X86-NEXT:    wait
 ; AVX512-X86-NEXT:    setbe %dl
 ; AVX512-X86-NEXT:    shll $31, %edx
 ; AVX512-X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
@@ -670,6 +685,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; CHECK-NEXT:    flds 8(%ebp)
 ; CHECK-NEXT:    flds {{\.LCPI.*}}
 ; CHECK-NEXT:    fcom %st(1)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstsw %ax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    # kill: def $ah killed $ah killed $ax
@@ -684,6 +700,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; CHECK-NEXT:  .LBB9_2:
 ; CHECK-NEXT:    fstp %st(1)
 ; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -734,6 +751,7 @@ define i8 @fptosi_f64toi8(double %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -780,6 +798,7 @@ define i16 @fptosi_f64toi16(double %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -822,6 +841,7 @@ define i32 @fptosi_f64toi32(double %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw (%esp)
 ; CHECK-NEXT:    movzwl (%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -851,6 +871,7 @@ define i64 @fptosi_f64toi64(double %x) #0 {
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; SSE-X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -883,6 +904,7 @@ define i64 @fptosi_f64toi64(double %x) #0 {
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
 ; AVX-X86-NEXT:    fisttpll (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl (%esp), %eax
 ; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; AVX-X86-NEXT:    movl %ebp, %esp
@@ -905,6 +927,7 @@ define i64 @fptosi_f64toi64(double %x) #0 {
 ; CHECK-NEXT:    andl $-8, %esp
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    fldl 8(%ebp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -953,6 +976,7 @@ define i1 @fptoui_f64toi1(double %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -999,6 +1023,7 @@ define i8 @fptoui_f64toi8(double %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1045,6 +1070,7 @@ define i16 @fptoui_f64toi16(double %x) #0 {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw (%esp)
 ; CHECK-NEXT:    movzwl (%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1100,6 +1126,7 @@ define i32 @fptoui_f64toi32(double %x) #0 {
 ; AVX1-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX1-X86-NEXT:    fldl (%esp)
 ; AVX1-X86-NEXT:    fisttpll (%esp)
+; AVX1-X86-NEXT:    wait
 ; AVX1-X86-NEXT:    movl (%esp), %eax
 ; AVX1-X86-NEXT:    movl %ebp, %esp
 ; AVX1-X86-NEXT:    popl %ebp
@@ -1132,6 +1159,7 @@ define i32 @fptoui_f64toi32(double %x) #0 {
 ; CHECK-NEXT:    andl $-8, %esp
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    fldl 8(%ebp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1171,6 +1199,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; SSE-X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    setbe %al
 ; SSE-X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; SSE-X86-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -1225,6 +1254,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; AVX1-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX1-X86-NEXT:    fldl (%esp)
 ; AVX1-X86-NEXT:    fisttpll (%esp)
+; AVX1-X86-NEXT:    wait
 ; AVX1-X86-NEXT:    setbe %al
 ; AVX1-X86-NEXT:    movzbl %al, %edx
 ; AVX1-X86-NEXT:    shll $31, %edx
@@ -1273,6 +1303,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; AVX512-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX512-X86-NEXT:    fldl (%esp)
 ; AVX512-X86-NEXT:    fisttpll (%esp)
+; AVX512-X86-NEXT:    wait
 ; AVX512-X86-NEXT:    setbe %dl
 ; AVX512-X86-NEXT:    shll $31, %edx
 ; AVX512-X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
@@ -1299,6 +1330,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; CHECK-NEXT:    fldl 8(%ebp)
 ; CHECK-NEXT:    flds {{\.LCPI.*}}
 ; CHECK-NEXT:    fcom %st(1)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstsw %ax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    # kill: def $ah killed $ah killed $ax
@@ -1313,6 +1345,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; CHECK-NEXT:  .LBB18_2:
 ; CHECK-NEXT:    fstp %st(1)
 ; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    orl $3072, %ecx # imm = 0xC00

diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
index 887185af8121..0a50f3df2ac2 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
@@ -41,6 +41,7 @@ define float @sitofp_i1tof32(i1 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2ss %eax, %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -64,6 +65,7 @@ define float @sitofp_i1tof32(i1 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -86,6 +88,7 @@ define float @sitofp_i1tof32(i1 %x) #0 {
 ; X87-NEXT:    movsbl %al, %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -104,6 +107,7 @@ define float @sitofp_i8tof32(i8 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2ss %eax, %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -122,6 +126,7 @@ define float @sitofp_i8tof32(i8 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -139,6 +144,7 @@ define float @sitofp_i8tof32(i8 %x) #0 {
 ; X87-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -157,6 +163,7 @@ define float @sitofp_i16tof32(i16 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2ss %eax, %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -175,6 +182,7 @@ define float @sitofp_i16tof32(i16 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -192,6 +200,7 @@ define float @sitofp_i16tof32(i16 %x) #0 {
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -209,6 +218,7 @@ define float @sitofp_i32tof32(i32 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -225,6 +235,7 @@ define float @sitofp_i32tof32(i32 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -241,6 +252,7 @@ define float @sitofp_i32tof32(i32 %x) #0 {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -258,6 +270,7 @@ define float @sitofp_i64tof32(i64 %x) #0 {
 ; SSE-X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    fstps (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -274,6 +287,7 @@ define float @sitofp_i64tof32(i64 %x) #0 {
 ; AVX-X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-X86-NEXT:    fstps (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -286,6 +300,7 @@ define float @sitofp_i64tof32(i64 %x) #0 {
 ; X87-LABEL: sitofp_i64tof32:
 ; X87:       # %bb.0:
 ; X87-NEXT:    fildll {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %x,
                                                metadata !"round.dynamic",
@@ -304,6 +319,7 @@ define float @uitofp_i1tof32(i1 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2ss %eax, %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -324,6 +340,7 @@ define float @uitofp_i1tof32(i1 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -343,6 +360,7 @@ define float @uitofp_i1tof32(i1 %x) #0 {
 ; X87-NEXT:    movzbl %al, %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -361,6 +379,7 @@ define float @uitofp_i8tof32(i8 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2ss %eax, %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -379,6 +398,7 @@ define float @uitofp_i8tof32(i8 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -396,6 +416,7 @@ define float @uitofp_i8tof32(i8 %x) #0 {
 ; X87-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -414,6 +435,7 @@ define float @uitofp_i16tof32(i16 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2ss %eax, %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -432,6 +454,7 @@ define float @uitofp_i16tof32(i16 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -449,6 +472,7 @@ define float @uitofp_i16tof32(i16 %x) #0 {
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -471,6 +495,7 @@ define float @uitofp_i32tof32(i32 %x) #0 {
 ; SSE-X86-NEXT:    cvtsd2ss %xmm1, %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT:    retl
@@ -492,6 +517,7 @@ define float @uitofp_i32tof32(i32 %x) #0 {
 ; AVX1-X86-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
 ; AVX1-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX1-X86-NEXT:    flds (%esp)
+; AVX1-X86-NEXT:    wait
 ; AVX1-X86-NEXT:    popl %eax
 ; AVX1-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX1-X86-NEXT:    retl
@@ -509,6 +535,7 @@ define float @uitofp_i32tof32(i32 %x) #0 {
 ; AVX512-X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX512-X86-NEXT:    flds (%esp)
+; AVX512-X86-NEXT:    wait
 ; AVX512-X86-NEXT:    popl %eax
 ; AVX512-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX512-X86-NEXT:    retl
@@ -531,6 +558,7 @@ define float @uitofp_i32tof32(i32 %x) #0 {
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    movl %ebp, %esp
 ; X87-NEXT:    popl %ebp
 ; X87-NEXT:    .cfi_def_cfa %esp, 4
@@ -558,9 +586,11 @@ define float @uitofp_i64tof32(i64 %x) #0 {
 ; SSE-X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -598,9 +628,11 @@ define float @uitofp_i64tof32(i64 %x) #0 {
 ; AVX-X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-X86-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; AVX-X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -645,6 +677,7 @@ define float @uitofp_i64tof32(i64 %x) #0 {
 ; X87-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
 ; X87-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    movl %ebp, %esp
 ; X87-NEXT:    popl %ebp
 ; X87-NEXT:    .cfi_def_cfa %esp, 4
@@ -669,6 +702,7 @@ define double @sitofp_i8tof64(i8 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2sd %eax, %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -693,6 +727,7 @@ define double @sitofp_i8tof64(i8 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -711,6 +746,7 @@ define double @sitofp_i8tof64(i8 %x) #0 {
 ; X87-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -734,6 +770,7 @@ define double @sitofp_i16tof64(i16 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2sd %eax, %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -758,6 +795,7 @@ define double @sitofp_i16tof64(i16 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -776,6 +814,7 @@ define double @sitofp_i16tof64(i16 %x) #0 {
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -798,6 +837,7 @@ define double @sitofp_i32tof64(i32 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2sdl 8(%ebp), %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -820,6 +860,7 @@ define double @sitofp_i32tof64(i32 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2sdl 8(%ebp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -837,6 +878,7 @@ define double @sitofp_i32tof64(i32 %x) #0 {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -859,6 +901,7 @@ define double @sitofp_i64tof64(i64 %x) #0 {
 ; SSE-X86-NEXT:    fildll 8(%ebp)
 ; SSE-X86-NEXT:    fstpl (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -881,6 +924,7 @@ define double @sitofp_i64tof64(i64 %x) #0 {
 ; AVX-X86-NEXT:    fildll 8(%ebp)
 ; AVX-X86-NEXT:    fstpl (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -894,6 +938,7 @@ define double @sitofp_i64tof64(i64 %x) #0 {
 ; X87-LABEL: sitofp_i64tof64:
 ; X87:       # %bb.0:
 ; X87-NEXT:    fildll {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %x,
                                                metadata !"round.dynamic",
@@ -917,6 +962,7 @@ define double @uitofp_i1tof64(i1 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2sd %eax, %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -943,6 +989,7 @@ define double @uitofp_i1tof64(i1 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -963,6 +1010,7 @@ define double @uitofp_i1tof64(i1 %x) #0 {
 ; X87-NEXT:    movzbl %al, %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -986,6 +1034,7 @@ define double @uitofp_i8tof64(i8 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2sd %eax, %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1010,6 +1059,7 @@ define double @uitofp_i8tof64(i8 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1028,6 +1078,7 @@ define double @uitofp_i8tof64(i8 %x) #0 {
 ; X87-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X87-NEXT:    filds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -1051,6 +1102,7 @@ define double @uitofp_i16tof64(i16 %x) #0 {
 ; SSE-X86-NEXT:    cvtsi2sd %eax, %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1075,6 +1127,7 @@ define double @uitofp_i16tof64(i16 %x) #0 {
 ; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1093,6 +1146,7 @@ define double @uitofp_i16tof64(i16 %x) #0 {
 ; X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    .cfi_def_cfa_offset 4
 ; X87-NEXT:    retl
@@ -1118,6 +1172,7 @@ define double @uitofp_i32tof64(i32 %x) #0 {
 ; SSE-X86-NEXT:    subsd %xmm0, %xmm1
 ; SSE-X86-NEXT:    movsd %xmm1, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1144,6 +1199,7 @@ define double @uitofp_i32tof64(i32 %x) #0 {
 ; AVX1-X86-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
 ; AVX1-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX1-X86-NEXT:    fldl (%esp)
+; AVX1-X86-NEXT:    wait
 ; AVX1-X86-NEXT:    movl %ebp, %esp
 ; AVX1-X86-NEXT:    popl %ebp
 ; AVX1-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1167,6 +1223,7 @@ define double @uitofp_i32tof64(i32 %x) #0 {
 ; AVX512-X86-NEXT:    vcvtusi2sdl 8(%ebp), %xmm0, %xmm0
 ; AVX512-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX512-X86-NEXT:    fldl (%esp)
+; AVX512-X86-NEXT:    wait
 ; AVX512-X86-NEXT:    movl %ebp, %esp
 ; AVX512-X86-NEXT:    popl %ebp
 ; AVX512-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1190,6 +1247,7 @@ define double @uitofp_i32tof64(i32 %x) #0 {
 ; X87-NEXT:    movl %eax, (%esp)
 ; X87-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    movl %ebp, %esp
 ; X87-NEXT:    popl %ebp
 ; X87-NEXT:    .cfi_def_cfa %esp, 4
@@ -1218,6 +1276,7 @@ define double @uitofp_i64tof64(i64 %x) #0 {
 ; SSE-X86-NEXT:    addpd %xmm0, %xmm1
 ; SSE-X86-NEXT:    movlpd %xmm1, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1249,6 +1308,7 @@ define double @uitofp_i64tof64(i64 %x) #0 {
 ; AVX-X86-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX-X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -1286,6 +1346,7 @@ define double @uitofp_i64tof64(i64 %x) #0 {
 ; X87-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    movl %ebp, %esp
 ; X87-NEXT:    popl %ebp
 ; X87-NEXT:    .cfi_def_cfa %esp, 4

diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
index 966be02124d8..26137bd76a9f 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
@@ -26,6 +26,7 @@ define float @fceil32(float %f) #0 {
 ; SSE41-X86-NEXT:    roundss $10, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE41-X86-NEXT:    flds (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    popl %eax
 ; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE41-X86-NEXT:    retl
@@ -43,6 +44,7 @@ define float @fceil32(float %f) #0 {
 ; AVX-X86-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -70,6 +72,7 @@ define double @fceilf64(double %f) #0 {
 ; SSE41-X86-NEXT:    roundsd $10, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE41-X86-NEXT:    fldl (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    movl %ebp, %esp
 ; SSE41-X86-NEXT:    popl %ebp
 ; SSE41-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -93,6 +96,7 @@ define double @fceilf64(double %f) #0 {
 ; AVX-X86-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -116,6 +120,7 @@ define float @ffloor32(float %f) #0 {
 ; SSE41-X86-NEXT:    roundss $9, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE41-X86-NEXT:    flds (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    popl %eax
 ; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE41-X86-NEXT:    retl
@@ -133,6 +138,7 @@ define float @ffloor32(float %f) #0 {
 ; AVX-X86-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -160,6 +166,7 @@ define double @ffloorf64(double %f) #0 {
 ; SSE41-X86-NEXT:    roundsd $9, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE41-X86-NEXT:    fldl (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    movl %ebp, %esp
 ; SSE41-X86-NEXT:    popl %ebp
 ; SSE41-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -183,6 +190,7 @@ define double @ffloorf64(double %f) #0 {
 ; AVX-X86-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -206,6 +214,7 @@ define float @ftrunc32(float %f) #0 {
 ; SSE41-X86-NEXT:    roundss $11, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE41-X86-NEXT:    flds (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    popl %eax
 ; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE41-X86-NEXT:    retl
@@ -223,6 +232,7 @@ define float @ftrunc32(float %f) #0 {
 ; AVX-X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -250,6 +260,7 @@ define double @ftruncf64(double %f) #0 {
 ; SSE41-X86-NEXT:    roundsd $11, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE41-X86-NEXT:    fldl (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    movl %ebp, %esp
 ; SSE41-X86-NEXT:    popl %ebp
 ; SSE41-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -273,6 +284,7 @@ define double @ftruncf64(double %f) #0 {
 ; AVX-X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -296,6 +308,7 @@ define float @frint32(float %f) #0 {
 ; SSE41-X86-NEXT:    roundss $4, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE41-X86-NEXT:    flds (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    popl %eax
 ; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE41-X86-NEXT:    retl
@@ -313,6 +326,7 @@ define float @frint32(float %f) #0 {
 ; AVX-X86-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -341,6 +355,7 @@ define double @frintf64(double %f) #0 {
 ; SSE41-X86-NEXT:    roundsd $4, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE41-X86-NEXT:    fldl (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    movl %ebp, %esp
 ; SSE41-X86-NEXT:    popl %ebp
 ; SSE41-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -364,6 +379,7 @@ define double @frintf64(double %f) #0 {
 ; AVX-X86-NEXT:    vroundsd $4, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -388,6 +404,7 @@ define float @fnearbyint32(float %f) #0 {
 ; SSE41-X86-NEXT:    roundss $12, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE41-X86-NEXT:    flds (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    popl %eax
 ; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
 ; SSE41-X86-NEXT:    retl
@@ -405,6 +422,7 @@ define float @fnearbyint32(float %f) #0 {
 ; AVX-X86-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
 ; AVX-X86-NEXT:    retl
@@ -433,6 +451,7 @@ define double @fnearbyintf64(double %f) #0 {
 ; SSE41-X86-NEXT:    roundsd $12, %xmm0, %xmm0
 ; SSE41-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE41-X86-NEXT:    fldl (%esp)
+; SSE41-X86-NEXT:    wait
 ; SSE41-X86-NEXT:    movl %ebp, %esp
 ; SSE41-X86-NEXT:    popl %ebp
 ; SSE41-X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -456,6 +475,7 @@ define double @fnearbyintf64(double %f) #0 {
 ; AVX-X86-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    .cfi_def_cfa %esp, 4

diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar.ll b/llvm/test/CodeGen/X86/fp-strict-scalar.ll
index c864acda3202..fbcde4aa4cd2 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar.ll
@@ -33,6 +33,7 @@ define double @fadd_f64(double %a, double %b) nounwind strictfp {
 ; SSE-X86-NEXT:    addsd 16(%ebp), %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    retl
@@ -52,6 +53,7 @@ define double @fadd_f64(double %a, double %b) nounwind strictfp {
 ; AVX-X86-NEXT:    vaddsd 16(%ebp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    retl
@@ -65,6 +67,7 @@ define double @fadd_f64(double %a, double %b) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    faddl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %ret = call double @llvm.experimental.constrained.fadd.f64(double %a, double %b,
                                                              metadata !"round.dynamic",
@@ -80,6 +83,7 @@ define float @fadd_f32(float %a, float %b) nounwind strictfp {
 ; SSE-X86-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    retl
 ;
@@ -95,6 +99,7 @@ define float @fadd_f32(float %a, float %b) nounwind strictfp {
 ; AVX-X86-NEXT:    vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    retl
 ;
@@ -107,6 +112,7 @@ define float @fadd_f32(float %a, float %b) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fadds {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %ret = call float @llvm.experimental.constrained.fadd.f32(float %a, float %b,
                                                             metadata !"round.dynamic",
@@ -125,6 +131,7 @@ define double @fsub_f64(double %a, double %b) nounwind strictfp {
 ; SSE-X86-NEXT:    subsd 16(%ebp), %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    retl
@@ -144,6 +151,7 @@ define double @fsub_f64(double %a, double %b) nounwind strictfp {
 ; AVX-X86-NEXT:    vsubsd 16(%ebp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    retl
@@ -157,6 +165,7 @@ define double @fsub_f64(double %a, double %b) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fsubl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %ret = call double @llvm.experimental.constrained.fsub.f64(double %a, double %b,
                                                              metadata !"round.dynamic",
@@ -172,6 +181,7 @@ define float @fsub_f32(float %a, float %b) nounwind strictfp {
 ; SSE-X86-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    retl
 ;
@@ -187,6 +197,7 @@ define float @fsub_f32(float %a, float %b) nounwind strictfp {
 ; AVX-X86-NEXT:    vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    retl
 ;
@@ -199,6 +210,7 @@ define float @fsub_f32(float %a, float %b) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fsubs {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %ret = call float @llvm.experimental.constrained.fsub.f32(float %a, float %b,
                                                             metadata !"round.dynamic",
@@ -217,6 +229,7 @@ define double @fmul_f64(double %a, double %b) nounwind strictfp {
 ; SSE-X86-NEXT:    mulsd 16(%ebp), %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    retl
@@ -236,6 +249,7 @@ define double @fmul_f64(double %a, double %b) nounwind strictfp {
 ; AVX-X86-NEXT:    vmulsd 16(%ebp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    retl
@@ -249,6 +263,7 @@ define double @fmul_f64(double %a, double %b) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fmull {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %ret = call double @llvm.experimental.constrained.fmul.f64(double %a, double %b,
                                                              metadata !"round.dynamic",
@@ -264,6 +279,7 @@ define float @fmul_f32(float %a, float %b) nounwind strictfp {
 ; SSE-X86-NEXT:    mulss {{[0-9]+}}(%esp), %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    retl
 ;
@@ -279,6 +295,7 @@ define float @fmul_f32(float %a, float %b) nounwind strictfp {
 ; AVX-X86-NEXT:    vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    retl
 ;
@@ -291,6 +308,7 @@ define float @fmul_f32(float %a, float %b) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fmuls {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %ret = call float @llvm.experimental.constrained.fmul.f32(float %a, float %b,
                                                             metadata !"round.dynamic",
@@ -309,6 +327,7 @@ define double @fdiv_f64(double %a, double %b) nounwind strictfp {
 ; SSE-X86-NEXT:    divsd 16(%ebp), %xmm0
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    fldl (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movl %ebp, %esp
 ; SSE-X86-NEXT:    popl %ebp
 ; SSE-X86-NEXT:    retl
@@ -328,6 +347,7 @@ define double @fdiv_f64(double %a, double %b) nounwind strictfp {
 ; AVX-X86-NEXT:    vdivsd 16(%ebp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    retl
@@ -341,6 +361,7 @@ define double @fdiv_f64(double %a, double %b) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fdivl {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %ret = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %b,
                                                              metadata !"round.dynamic",
@@ -356,6 +377,7 @@ define float @fdiv_f32(float %a, float %b) nounwind strictfp {
 ; SSE-X86-NEXT:    divss {{[0-9]+}}(%esp), %xmm0
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    flds (%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    popl %eax
 ; SSE-X86-NEXT:    retl
 ;
@@ -371,6 +393,7 @@ define float @fdiv_f32(float %a, float %b) nounwind strictfp {
 ; AVX-X86-NEXT:    vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    retl
 ;
@@ -383,6 +406,7 @@ define float @fdiv_f32(float %a, float %b) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fdivs {{[0-9]+}}(%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %ret = call float @llvm.experimental.constrained.fdiv.f32(float %a, float %b,
                                                             metadata !"round.dynamic",
@@ -429,6 +453,7 @@ define void @fpext_f32_to_f64(float* %val, double* %ret) nounwind strictfp {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    flds (%ecx)
 ; X87-NEXT:    fstpl (%eax)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %1 = load float, float* %val, align 4
   %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %1,
@@ -479,6 +504,7 @@ define void @fptrunc_double_to_f32(double* %val, float *%ret) nounwind strictfp
 ; X87-NEXT:    fstps (%esp)
 ; X87-NEXT:    flds (%esp)
 ; X87-NEXT:    fstps (%eax)
+; X87-NEXT:    wait
 ; X87-NEXT:    popl %eax
 ; X87-NEXT:    retl
   %1 = load double, double* %val, align 8
@@ -526,6 +552,7 @@ define void @fsqrt_f64(double* %a) nounwind strictfp {
 ; X87-NEXT:    fldl (%eax)
 ; X87-NEXT:    fsqrt
 ; X87-NEXT:    fstpl (%eax)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %1 = load double, double* %a, align 8
   %res = call double @llvm.experimental.constrained.sqrt.f64(double %1,
@@ -572,6 +599,7 @@ define void @fsqrt_f32(float* %a) nounwind strictfp {
 ; X87-NEXT:    flds (%eax)
 ; X87-NEXT:    fsqrt
 ; X87-NEXT:    fstps (%eax)
+; X87-NEXT:    wait
 ; X87-NEXT:    retl
   %1 = load float, float* %a, align 4
   %res = call float @llvm.experimental.constrained.sqrt.f32(float %1,
@@ -613,6 +641,7 @@ define double @fma_f64(double %a, double %b, double %c) nounwind strictfp {
 ; AVX-X86-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
 ; AVX-X86-NEXT:    vmovsd %xmm1, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    movl %ebp, %esp
 ; AVX-X86-NEXT:    popl %ebp
 ; AVX-X86-NEXT:    retl
@@ -631,6 +660,7 @@ define double @fma_f64(double %a, double %b, double %c) nounwind strictfp {
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll fma
 ; X87-NEXT:    addl $24, %esp
 ; X87-NEXT:    retl
@@ -669,6 +699,7 @@ define float @fma_f32(float %a, float %b, float %c) nounwind strictfp {
 ; AVX-X86-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
 ; AVX-X86-NEXT:    vmovss %xmm1, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
+; AVX-X86-NEXT:    wait
 ; AVX-X86-NEXT:    popl %eax
 ; AVX-X86-NEXT:    retl
 ;
@@ -686,6 +717,7 @@ define float @fma_f32(float %a, float %b, float %c) nounwind strictfp {
 ; X87-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps (%esp)
+; X87-NEXT:    wait
 ; X87-NEXT:    calll fmaf
 ; X87-NEXT:    addl $12, %esp
 ; X87-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
index 048656f48b62..ce5b09f8d411 100644
--- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
@@ -39,6 +39,7 @@ define void @TestFPExtF32_F128() nounwind strictfp {
 ; X86-NEXT:    subl $24, %esp
 ; X86-NEXT:    flds vf32
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __extendsftf2
@@ -86,6 +87,7 @@ define void @TestFPExtF64_F128() nounwind strictfp {
 ; X86-NEXT:    subl $40, %esp
 ; X86-NEXT:    fldl vf64
 ; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __extenddftf2
@@ -114,6 +116,7 @@ define void @TestFPExtF80_F128() nounwind strictfp {
 ; X64-SSE-NEXT:    subq $24, %rsp
 ; X64-SSE-NEXT:    fldt {{.*}}(%rip)
 ; X64-SSE-NEXT:    fstpt (%rsp)
+; X64-SSE-NEXT:    wait
 ; X64-SSE-NEXT:    callq __extendxftf2
 ; X64-SSE-NEXT:    movaps %xmm0, {{.*}}(%rip)
 ; X64-SSE-NEXT:    addq $24, %rsp
@@ -124,6 +127,7 @@ define void @TestFPExtF80_F128() nounwind strictfp {
 ; X64-AVX-NEXT:    subq $24, %rsp
 ; X64-AVX-NEXT:    fldt {{.*}}(%rip)
 ; X64-AVX-NEXT:    fstpt (%rsp)
+; X64-AVX-NEXT:    wait
 ; X64-AVX-NEXT:    callq __extendxftf2
 ; X64-AVX-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
 ; X64-AVX-NEXT:    addq $24, %rsp
@@ -135,6 +139,7 @@ define void @TestFPExtF80_F128() nounwind strictfp {
 ; X86-NEXT:    subl $40, %esp
 ; X86-NEXT:    fldt vf80
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __extendxftf2
@@ -186,6 +191,7 @@ define void @TestFPTruncF128_F32() nounwind strictfp {
 ; X86-NEXT:    calll __trunctfsf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstps vf32
+; X86-NEXT:    wait
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 entry:
@@ -224,6 +230,7 @@ define void @TestFPTruncF128_F64() nounwind strictfp {
 ; X86-NEXT:    calll __trunctfdf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstpl vf64
+; X86-NEXT:    wait
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 entry:
@@ -240,6 +247,7 @@ define void @TestFPTruncF128_F80() nounwind strictfp {
 ; X64-SSE-NEXT:    movaps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT:    callq __trunctfxf2
 ; X64-SSE-NEXT:    fstpt {{.*}}(%rip)
+; X64-SSE-NEXT:    wait
 ; X64-SSE-NEXT:    popq %rax
 ; X64-SSE-NEXT:    retq
 ;
@@ -249,6 +257,7 @@ define void @TestFPTruncF128_F80() nounwind strictfp {
 ; X64-AVX-NEXT:    vmovaps {{.*}}(%rip), %xmm0
 ; X64-AVX-NEXT:    callq __trunctfxf2
 ; X64-AVX-NEXT:    fstpt {{.*}}(%rip)
+; X64-AVX-NEXT:    wait
 ; X64-AVX-NEXT:    popq %rax
 ; X64-AVX-NEXT:    retq
 ;
@@ -262,6 +271,7 @@ define void @TestFPTruncF128_F80() nounwind strictfp {
 ; X86-NEXT:    calll __trunctfxf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstpt vf80
+; X86-NEXT:    wait
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 entry:

diff  --git a/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll
index 7e5896f04486..1e38b6744f3c 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll
@@ -8,6 +8,7 @@ define i32 @test_oeq_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -28,6 +29,7 @@ define i32 @test_oeq_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnel %esi, %eax
 ; X87-64-NEXT:    cmovpl %esi, %eax
 ; X87-64-NEXT:    retq
@@ -44,6 +46,7 @@ define i32 @test_ogt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -64,6 +67,7 @@ define i32 @test_ogt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovbel %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -79,6 +83,7 @@ define i32 @test_oge_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -99,6 +104,7 @@ define i32 @test_oge_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovbl %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -114,6 +120,7 @@ define i32 @test_olt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -134,6 +141,7 @@ define i32 @test_olt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovbel %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -149,6 +157,7 @@ define i32 @test_ole_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -169,6 +178,7 @@ define i32 @test_ole_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovbl %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -184,6 +194,7 @@ define i32 @test_one_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -204,6 +215,7 @@ define i32 @test_one_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovel %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -219,6 +231,7 @@ define i32 @test_ord_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -239,6 +252,7 @@ define i32 @test_ord_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovpl %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -254,6 +268,7 @@ define i32 @test_ueq_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -274,6 +289,7 @@ define i32 @test_ueq_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnel %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -289,6 +305,7 @@ define i32 @test_ugt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -309,6 +326,7 @@ define i32 @test_ugt_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovael %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -324,6 +342,7 @@ define i32 @test_uge_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -344,6 +363,7 @@ define i32 @test_uge_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmoval %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -359,6 +379,7 @@ define i32 @test_ult_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -379,6 +400,7 @@ define i32 @test_ult_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovael %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -394,6 +416,7 @@ define i32 @test_ule_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -414,6 +437,7 @@ define i32 @test_ule_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmoval %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -429,6 +453,7 @@ define i32 @test_une_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -449,6 +474,7 @@ define i32 @test_une_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnel %edi, %eax
 ; X87-64-NEXT:    cmovpl %edi, %eax
 ; X87-64-NEXT:    retq
@@ -465,6 +491,7 @@ define i32 @test_uno_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fucompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -485,6 +512,7 @@ define i32 @test_uno_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fucompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnpl %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmp.x86_fp80(
@@ -500,6 +528,7 @@ define i32 @test_oeq_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -520,6 +549,7 @@ define i32 @test_oeq_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnel %esi, %eax
 ; X87-64-NEXT:    cmovpl %esi, %eax
 ; X87-64-NEXT:    retq
@@ -536,6 +566,7 @@ define i32 @test_ogt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -556,6 +587,7 @@ define i32 @test_ogt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovbel %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -571,6 +603,7 @@ define i32 @test_oge_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -591,6 +624,7 @@ define i32 @test_oge_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovbl %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -606,6 +640,7 @@ define i32 @test_olt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -626,6 +661,7 @@ define i32 @test_olt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovbel %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -641,6 +677,7 @@ define i32 @test_ole_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -661,6 +698,7 @@ define i32 @test_ole_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovbl %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -676,6 +714,7 @@ define i32 @test_one_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -696,6 +735,7 @@ define i32 @test_one_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovel %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -711,6 +751,7 @@ define i32 @test_ord_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -731,6 +772,7 @@ define i32 @test_ord_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovpl %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -746,6 +788,7 @@ define i32 @test_ueq_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -766,6 +809,7 @@ define i32 @test_ueq_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnel %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -781,6 +825,7 @@ define i32 @test_ugt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -801,6 +846,7 @@ define i32 @test_ugt_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovael %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -816,6 +862,7 @@ define i32 @test_uge_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -836,6 +883,7 @@ define i32 @test_uge_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmoval %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -851,6 +899,7 @@ define i32 @test_ult_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -871,6 +920,7 @@ define i32 @test_ult_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovael %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -886,6 +936,7 @@ define i32 @test_ule_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -906,6 +957,7 @@ define i32 @test_ule_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmoval %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(
@@ -921,6 +973,7 @@ define i32 @test_une_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -941,6 +994,7 @@ define i32 @test_une_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnel %edi, %eax
 ; X87-64-NEXT:    cmovpl %edi, %eax
 ; X87-64-NEXT:    retq
@@ -957,6 +1011,7 @@ define i32 @test_uno_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-32-NEXT:    fcompp
+; X87-32-NEXT:    wait
 ; X87-32-NEXT:    fnstsw %ax
 ; X87-32-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-32-NEXT:    sahf
@@ -977,6 +1032,7 @@ define i32 @test_uno_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X87-64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X87-64-NEXT:    fcompi %st(1), %st
 ; X87-64-NEXT:    fstp %st(0)
+; X87-64-NEXT:    wait
 ; X87-64-NEXT:    cmovnpl %esi, %eax
 ; X87-64-NEXT:    retq
   %cond = call i1 @llvm.experimental.constrained.fcmps.x86_fp80(

diff  --git a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
index 5f03917f56ae..cf4a51fd6920 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
@@ -38,6 +38,7 @@ define x86_fp80 @fadd_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    faddp %st, %st(1)
+; X86-NEXT:    wait
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fadd_fp80:
@@ -45,6 +46,7 @@ define x86_fp80 @fadd_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    faddp %st, %st(1)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %ret = call x86_fp80 @llvm.experimental.constrained.fadd.x86_fp80(x86_fp80 %a, x86_fp80 %b,
                                                                     metadata !"round.dynamic",
@@ -58,6 +60,7 @@ define x86_fp80 @fsub_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fsubp %st, %st(1)
+; X86-NEXT:    wait
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fsub_fp80:
@@ -65,6 +68,7 @@ define x86_fp80 @fsub_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fsubp %st, %st(1)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %ret = call x86_fp80 @llvm.experimental.constrained.fsub.x86_fp80(x86_fp80 %a, x86_fp80 %b,
                                                                     metadata !"round.dynamic",
@@ -78,6 +82,7 @@ define x86_fp80 @fmul_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fmulp %st, %st(1)
+; X86-NEXT:    wait
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fmul_fp80:
@@ -85,6 +90,7 @@ define x86_fp80 @fmul_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fmulp %st, %st(1)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %ret = call x86_fp80 @llvm.experimental.constrained.fmul.x86_fp80(x86_fp80 %a, x86_fp80 %b,
                                                                     metadata !"round.dynamic",
@@ -98,6 +104,7 @@ define x86_fp80 @fdiv_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fdivp %st, %st(1)
+; X86-NEXT:    wait
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fdiv_fp80:
@@ -105,6 +112,7 @@ define x86_fp80 @fdiv_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fdivp %st, %st(1)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %ret = call x86_fp80 @llvm.experimental.constrained.fdiv.x86_fp80(x86_fp80 %a, x86_fp80 %b,
                                                                     metadata !"round.dynamic",
@@ -116,12 +124,14 @@ define x86_fp80 @fpext_f32_to_fp80(float %a) nounwind strictfp {
 ; X86-LABEL: fpext_f32_to_fp80:
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fpext_f32_to_fp80:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movss %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    flds -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %ret = call x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f32(float %a,
                                                                          metadata !"fpexcept.strict") #0
@@ -132,12 +142,14 @@ define x86_fp80 @fpext_f64_to_fp80(double %a) nounwind strictfp {
 ; X86-LABEL: fpext_f64_to_fp80:
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fpext_f64_to_fp80:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movsd %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %ret = call x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f64(double %a,
                                                                          metadata !"fpexcept.strict") #0
@@ -151,6 +163,7 @@ define float @fptrunc_fp80_to_f32(x86_fp80 %a) nounwind strictfp {
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    flds (%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
 ;
@@ -158,6 +171,7 @@ define float @fptrunc_fp80_to_f32(x86_fp80 %a) nounwind strictfp {
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    retq
   %ret = call float @llvm.experimental.constrained.fptrunc.f32.x86_fp80(x86_fp80 %a,
@@ -176,6 +190,7 @@ define double @fptrunc_fp80_to_f64(x86_fp80 %a) nounwind strictfp {
 ; X86-NEXT:    fldt 8(%ebp)
 ; X86-NEXT:    fstpl (%esp)
 ; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -184,6 +199,7 @@ define double @fptrunc_fp80_to_f64(x86_fp80 %a) nounwind strictfp {
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fstpl -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    retq
   %ret = call double @llvm.experimental.constrained.fptrunc.f64.x86_fp80(x86_fp80 %a,
@@ -197,12 +213,14 @@ define x86_fp80 @fsqrt_fp80(x86_fp80 %a) nounwind strictfp {
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fsqrt
+; X86-NEXT:    wait
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fsqrt_fp80:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fsqrt
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %ret = call x86_fp80 @llvm.experimental.constrained.sqrt.x86_fp80(x86_fp80 %a,
                                                                     metadata !"round.dynamic",
@@ -216,6 +234,7 @@ define i1 @fp80_to_sint1(x86_fp80 %x) #0 {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -231,6 +250,7 @@ define i1 @fp80_to_sint1(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_sint1:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -251,6 +271,7 @@ define i8 @fp80_to_sint8(x86_fp80 %x) #0 {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -266,6 +287,7 @@ define i8 @fp80_to_sint8(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_sint8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -286,6 +308,7 @@ define i16 @fp80_to_sint16(x86_fp80 %x) #0 {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -301,6 +324,7 @@ define i16 @fp80_to_sint16(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_sint16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -321,6 +345,7 @@ define i32 @fp80_to_sint32(x86_fp80 %x) #0 {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw (%esp)
 ; X86-NEXT:    movzwl (%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -336,6 +361,7 @@ define i32 @fp80_to_sint32(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_sint32:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -362,6 +388,7 @@ define i64 @fp80_to_sint64(x86_fp80 %x) #0 {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    fldt 8(%ebp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -379,6 +406,7 @@ define i64 @fp80_to_sint64(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_sint64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -399,6 +427,7 @@ define i1 @fp80_to_uint1(x86_fp80 %x) #0 {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -414,6 +443,7 @@ define i1 @fp80_to_uint1(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_uint1:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -434,6 +464,7 @@ define i8 @fp80_to_uint8(x86_fp80 %x) #0 {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -449,6 +480,7 @@ define i8 @fp80_to_uint8(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_uint8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -469,6 +501,7 @@ define i16 @fp80_to_uint16(x86_fp80 %x) #0 {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw (%esp)
 ; X86-NEXT:    movzwl (%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -485,6 +518,7 @@ define i16 @fp80_to_uint16(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_uint16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -511,6 +545,7 @@ define i32 @fp80_to_uint32(x86_fp80 %x) #0 {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    fldt 8(%ebp)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -527,6 +562,7 @@ define i32 @fp80_to_uint32(x86_fp80 %x) #0 {
 ; X64-LABEL: fp80_to_uint32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -554,6 +590,7 @@ define i64 @fp80_to_uint64(x86_fp80 %x) #0 {
 ; X86-NEXT:    fldt 8(%ebp)
 ; X86-NEXT:    flds {{\.LCPI.*}}
 ; X86-NEXT:    fcom %st(1)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
@@ -568,6 +605,7 @@ define i64 @fp80_to_uint64(x86_fp80 %x) #0 {
 ; X86-NEXT:  .LBB18_2:
 ; X86-NEXT:    fstp %st(1)
 ; X86-NEXT:    fsubrp %st, %st(1)
+; X86-NEXT:    wait
 ; X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -588,14 +626,17 @@ define i64 @fp80_to_uint64(x86_fp80 %x) #0 {
 ; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    flds {{.*}}(%rip)
+; X64-NEXT:    wait
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    fcomi %st(1), %st
+; X64-NEXT:    wait
 ; X64-NEXT:    setbe %al
 ; X64-NEXT:    fldz
 ; X64-NEXT:    fxch %st(1)
 ; X64-NEXT:    fcmovnbe %st(1), %st
 ; X64-NEXT:    fstp %st(1)
 ; X64-NEXT:    fsubrp %st, %st(1)
+; X64-NEXT:    wait
 ; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
 ; X64-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -622,6 +663,7 @@ define x86_fp80 @sint1_to_fp80(i1 %x) #0 {
 ; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    filds {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -633,6 +675,7 @@ define x86_fp80 @sint1_to_fp80(i1 %x) #0 {
 ; X64-NEXT:    movsbl %dil, %eax
 ; X64-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    filds -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i1(i1 %x,
                                                metadata !"round.dynamic",
@@ -648,6 +691,7 @@ define x86_fp80 @sint8_to_fp80(i8 %x) #0 {
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    filds {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -657,6 +701,7 @@ define x86_fp80 @sint8_to_fp80(i8 %x) #0 {
 ; X64-NEXT:    movsbl %dil, %eax
 ; X64-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    filds -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i8(i8 %x,
                                                metadata !"round.dynamic",
@@ -672,6 +717,7 @@ define x86_fp80 @sint16_to_fp80(i16 %x) #0 {
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    filds {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -680,6 +726,7 @@ define x86_fp80 @sint16_to_fp80(i16 %x) #0 {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movw %di, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    filds -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i16(i16 %x,
                                                metadata !"round.dynamic",
@@ -695,6 +742,7 @@ define x86_fp80 @sint32_to_fp80(i32 %x) #0 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    fildl (%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -703,6 +751,7 @@ define x86_fp80 @sint32_to_fp80(i32 %x) #0 {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    fildl -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i32(i32 %x,
                                                metadata !"round.dynamic",
@@ -714,12 +763,14 @@ define x86_fp80 @sint64_to_fp80(i64 %x) #0 {
 ; X86-LABEL: sint64_to_fp80:
 ; X86:       # %bb.0:
 ; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sint64_to_fp80:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    fildll -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.sitofp.x86_fp80.i64(i64 %x,
                                                metadata !"round.dynamic",
@@ -737,6 +788,7 @@ define x86_fp80 @uint1_to_fp80(i1 %x) #0 {
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    filds {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -746,6 +798,7 @@ define x86_fp80 @uint1_to_fp80(i1 %x) #0 {
 ; X64-NEXT:    andl $1, %edi
 ; X64-NEXT:    movw %di, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    filds -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i1(i1 %x,
                                                metadata !"round.dynamic",
@@ -761,6 +814,7 @@ define x86_fp80 @uint8_to_fp80(i8 %x) #0 {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    filds {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -770,6 +824,7 @@ define x86_fp80 @uint8_to_fp80(i8 %x) #0 {
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    filds -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i8(i8 %x,
                                                metadata !"round.dynamic",
@@ -785,6 +840,7 @@ define x86_fp80 @uint16_to_fp80(i16 %x) #0 {
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    fildl (%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -794,6 +850,7 @@ define x86_fp80 @uint16_to_fp80(i16 %x) #0 {
 ; X64-NEXT:    movzwl %di, %eax
 ; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    fildl -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i16(i16 %x,
                                                metadata !"round.dynamic",
@@ -815,6 +872,7 @@ define x86_fp80 @uint32_to_fp80(i32 %x) #0 {
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fildll (%esp)
+; X86-NEXT:    wait
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -825,6 +883,7 @@ define x86_fp80 @uint32_to_fp80(i32 %x) #0 {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    fildll -{{[0-9]+}}(%rsp)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i32(i32 %x,
                                                metadata !"round.dynamic",
@@ -849,6 +908,7 @@ define x86_fp80 @uint64_to_fp80(i64 %x) #0 {
 ; X86-NEXT:    shrl $31, %ecx
 ; X86-NEXT:    fildll (%esp)
 ; X86-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
+; X86-NEXT:    wait
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -862,6 +922,7 @@ define x86_fp80 @uint64_to_fp80(i64 %x) #0 {
 ; X64-NEXT:    sets %al
 ; X64-NEXT:    fildll -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    fadds {{\.LCPI.*}}(,%rax,4)
+; X64-NEXT:    wait
 ; X64-NEXT:    retq
   %result = call x86_fp80 @llvm.experimental.constrained.uitofp.x86_fp80.i64(i64 %x,
                                                metadata !"round.dynamic",

diff  --git a/llvm/test/CodeGen/X86/vec-strict-128.ll b/llvm/test/CodeGen/X86/vec-strict-128.ll
index 5aa0802adf03..98162a1da9a9 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128.ll
@@ -234,6 +234,7 @@ define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    calll fmaf
 ; SSE-X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-X86-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
@@ -245,6 +246,7 @@ define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    calll fmaf
 ; SSE-X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
@@ -269,8 +271,10 @@ define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
 ; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    calll fmaf
 ; SSE-X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-X86-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -359,8 +363,10 @@ define <2 x double> @f14(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
 ; SSE-X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-X86-NEXT:    movhps %xmm0, (%esp)
 ; SSE-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    calll fma
 ; SSE-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    wait
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-X86-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE-X86-NEXT:    movl %ebp, %esp

diff  --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index fb1ec511fcca..205ea7f66720 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -54,6 +54,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    movhps %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -62,6 +63,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw (%esp)
 ; SSE-32-NEXT:    movzwl (%esp), %eax
 ; SSE-32-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -103,6 +105,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -137,6 +140,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -171,6 +175,7 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -230,6 +235,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    movsd %xmm4, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; SSE-32-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -247,6 +253,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %cl
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw (%esp)
 ; SSE-32-NEXT:    movzwl (%esp), %edx
 ; SSE-32-NEXT:    orl $3072, %edx # imm = 0xC00
@@ -329,6 +336,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    vmovsd %xmm3, (%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %al
 ; AVX-32-NEXT:    movzbl %al, %eax
 ; AVX-32-NEXT:    shll $31, %eax
@@ -342,6 +350,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %cl
 ; AVX-32-NEXT:    movzbl %cl, %ecx
 ; AVX-32-NEXT:    shll $31, %ecx
@@ -410,6 +419,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512F-32-NEXT:    vmovsd %xmm1, (%esp)
 ; AVX512F-32-NEXT:    fldl (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %al
 ; AVX512F-32-NEXT:    shll $31, %eax
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
@@ -422,6 +432,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512F-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %cl
 ; AVX512F-32-NEXT:    shll $31, %ecx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
@@ -466,6 +477,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm1, (%esp)
 ; AVX512VL-32-NEXT:    fldl (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
@@ -478,6 +490,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %cl
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
@@ -531,6 +544,7 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE-32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -539,6 +553,7 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw (%esp)
 ; SSE-32-NEXT:    movzwl (%esp), %eax
 ; SSE-32-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -580,6 +595,7 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -614,6 +630,7 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512F-32-NEXT:    fisttpll (%esp)
 ; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -648,6 +665,7 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -723,6 +741,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; SSE-32-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -740,6 +759,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %cl
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw (%esp)
 ; SSE-32-NEXT:    movzwl (%esp), %edx
 ; SSE-32-NEXT:    orl $3072, %edx # imm = 0xC00
@@ -822,6 +842,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %al
 ; AVX-32-NEXT:    movzbl %al, %eax
 ; AVX-32-NEXT:    shll $31, %eax
@@ -835,6 +856,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %cl
 ; AVX-32-NEXT:    movzbl %cl, %ecx
 ; AVX-32-NEXT:    shll $31, %ecx
@@ -903,6 +925,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512F-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %al
 ; AVX512F-32-NEXT:    shll $31, %eax
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
@@ -915,6 +938,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512F-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX512F-32-NEXT:    flds (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %cl
 ; AVX512F-32-NEXT:    shll $31, %ecx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
@@ -959,6 +983,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm1, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
@@ -971,6 +996,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %cl
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
@@ -1128,6 +1154,7 @@ define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, (%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    movl %ebp, %esp
@@ -1283,6 +1310,7 @@ define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    movl %ebp, %esp
@@ -1764,6 +1792,7 @@ define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    movhps %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1772,6 +1801,7 @@ define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw (%esp)
 ; SSE-32-NEXT:    movzwl (%esp), %eax
 ; SSE-32-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -1813,6 +1843,7 @@ define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -1896,6 +1927,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    movsd %xmm4, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; SSE-32-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -1913,6 +1945,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; SSE-32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %cl
 ; SSE-32-NEXT:    fldl {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw (%esp)
 ; SSE-32-NEXT:    movzwl (%esp), %edx
 ; SSE-32-NEXT:    orl $3072, %edx # imm = 0xC00
@@ -1995,6 +2028,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    vmovsd %xmm3, (%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %al
 ; AVX-32-NEXT:    movzbl %al, %eax
 ; AVX-32-NEXT:    shll $31, %eax
@@ -2008,6 +2042,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; AVX-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %cl
 ; AVX-32-NEXT:    movzbl %cl, %ecx
 ; AVX-32-NEXT:    shll $31, %ecx
@@ -2111,6 +2146,7 @@ define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE-32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -2119,6 +2155,7 @@ define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw (%esp)
 ; SSE-32-NEXT:    movzwl (%esp), %eax
 ; SSE-32-NEXT:    orl $3072, %eax # imm = 0xC00
@@ -2160,6 +2197,7 @@ define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -2265,6 +2303,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %al
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; SSE-32-NEXT:    orl $3072, %ecx # imm = 0xC00
@@ -2282,6 +2321,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; SSE-32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    setae %cl
 ; SSE-32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    fnstcw (%esp)
 ; SSE-32-NEXT:    movzwl (%esp), %edx
 ; SSE-32-NEXT:    orl $3072, %edx # imm = 0xC00
@@ -2364,6 +2404,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %al
 ; AVX-32-NEXT:    movzbl %al, %eax
 ; AVX-32-NEXT:    shll $31, %eax
@@ -2377,6 +2418,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %cl
 ; AVX-32-NEXT:    movzbl %cl, %ecx
 ; AVX-32-NEXT:    shll $31, %ecx

diff  --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index 6b7f083a2c87..38f19e9ed2d5 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -57,6 +57,7 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -111,6 +112,7 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -165,6 +167,7 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -236,6 +239,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %al
 ; AVX-32-NEXT:    movzbl %al, %eax
 ; AVX-32-NEXT:    shll $31, %eax
@@ -252,6 +256,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    vmovsd %xmm4, (%esp)
 ; AVX-32-NEXT:    fldl (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %cl
 ; AVX-32-NEXT:    movzbl %cl, %ecx
 ; AVX-32-NEXT:    shll $31, %ecx
@@ -266,6 +271,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %dl
 ; AVX-32-NEXT:    movzbl %dl, %edx
 ; AVX-32-NEXT:    shll $31, %edx
@@ -279,6 +285,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -385,6 +392,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    movl $0, %eax
 ; AVX512F-32-NEXT:    setae %al
 ; AVX512F-32-NEXT:    shll $31, %eax
@@ -402,6 +410,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    vmovsd %xmm4, (%esp)
 ; AVX512F-32-NEXT:    fldl (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %cl
 ; AVX512F-32-NEXT:    shll $31, %ecx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
@@ -415,6 +424,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %dl
 ; AVX512F-32-NEXT:    shll $31, %edx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
@@ -427,6 +437,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -489,6 +500,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -506,6 +518,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm4, (%esp)
 ; AVX512VL-32-NEXT:    fldl (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %cl
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
@@ -519,6 +532,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %dl
 ; AVX512VL-32-NEXT:    shll $31, %edx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
@@ -531,6 +545,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -606,6 +621,7 @@ define <4 x i64> @strict_vector_fptosi_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -659,6 +675,7 @@ define <4 x i64> @strict_vector_fptosi_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -712,6 +729,7 @@ define <4 x i64> @strict_vector_fptosi_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -783,6 +801,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %al
 ; AVX-32-NEXT:    movzbl %al, %eax
 ; AVX-32-NEXT:    shll $31, %eax
@@ -798,6 +817,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    vmovss %xmm3, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %cl
 ; AVX-32-NEXT:    movzbl %cl, %ecx
 ; AVX-32-NEXT:    shll $31, %ecx
@@ -813,6 +833,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    setae %dl
 ; AVX-32-NEXT:    movzbl %dl, %edx
 ; AVX-32-NEXT:    shll $31, %edx
@@ -826,6 +847,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -932,6 +954,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512F-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    movl $0, %eax
 ; AVX512F-32-NEXT:    setae %al
 ; AVX512F-32-NEXT:    shll $31, %eax
@@ -948,6 +971,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512F-32-NEXT:    vmovss %xmm2, (%esp)
 ; AVX512F-32-NEXT:    flds (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %cl
 ; AVX512F-32-NEXT:    shll $31, %ecx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
@@ -962,6 +986,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512F-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    setae %dl
 ; AVX512F-32-NEXT:    shll $31, %edx
 ; AVX512F-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
@@ -974,6 +999,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512F-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    wait
 ; AVX512F-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX512F-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -1036,6 +1062,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -1052,6 +1079,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm2, (%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %cl
 ; AVX512VL-32-NEXT:    shll $31, %ecx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
@@ -1066,6 +1094,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm2, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %dl
 ; AVX512VL-32-NEXT:    shll $31, %edx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
@@ -1078,6 +1107,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
index 3cc9edb83e13..06464ea1cb81 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -65,6 +65,7 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -160,6 +161,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -176,6 +178,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm4, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -190,6 +193,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -206,6 +210,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm4, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -220,6 +225,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -237,6 +243,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm4, (%esp)
 ; AVX512VL-32-NEXT:    fldl (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %dl
 ; AVX512VL-32-NEXT:    shll $31, %edx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
@@ -250,6 +257,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
@@ -262,6 +270,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
@@ -370,6 +379,7 @@ define <8 x i64> @strict_vector_fptosi_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -465,6 +475,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -480,6 +491,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -495,6 +507,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -511,6 +524,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm4, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -525,6 +539,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm4, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    movl $0, %eax
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
@@ -541,6 +556,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm4, (%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %dl
 ; AVX512VL-32-NEXT:    shll $31, %edx
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
@@ -555,6 +571,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    setae %al
 ; AVX512VL-32-NEXT:    shll $31, %eax
 ; AVX512VL-32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
@@ -567,6 +584,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
 ; AVX512VL-32-NEXT:    fisttpll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    wait
 ; AVX512VL-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
 ; AVX512VL-32-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
index f3c79c2cb397..51d9fe2bb864 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -141,6 +141,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; SSE-32-NEXT:    fstps (%esp)
 ; SSE-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -177,6 +178,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; SSE41-32-NEXT:    fstps (%esp)
 ; SSE41-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE41-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE41-32-NEXT:    wait
 ; SSE41-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE41-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE41-32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -213,6 +215,7 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; AVX-32-NEXT:    fstps (%esp)
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; AVX-32-NEXT:    movl %ebp, %esp
@@ -277,12 +280,14 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; SSE-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; SSE-32-NEXT:    fstps (%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE-32-NEXT:    movd %xmm0, %eax
 ; SSE-32-NEXT:    shrl $31, %eax
 ; SSE-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; SSE-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -344,12 +349,14 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; SSE41-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE41-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; SSE41-32-NEXT:    fstps (%esp)
+; SSE41-32-NEXT:    wait
 ; SSE41-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE41-32-NEXT:    movd %xmm0, %eax
 ; SSE41-32-NEXT:    shrl $31, %eax
 ; SSE41-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE41-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; SSE41-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE41-32-NEXT:    wait
 ; SSE41-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE41-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE41-32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -410,11 +417,13 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vextractps $3, %xmm0, %eax
 ; AVX-32-NEXT:    shrl $31, %eax
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; AVX-32-NEXT:    fstps (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; AVX-32-NEXT:    movl %ebp, %esp
@@ -1143,6 +1152,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
 ; SSE-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    fstpl (%esp)
+; SSE-32-NEXT:    wait
 ; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-32-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE-32-NEXT:    movl %ebp, %esp
@@ -1178,6 +1188,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
 ; SSE41-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; SSE41-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE41-32-NEXT:    fstpl (%esp)
+; SSE41-32-NEXT:    wait
 ; SSE41-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-32-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; SSE41-32-NEXT:    movl %ebp, %esp
@@ -1213,6 +1224,7 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
 ; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fstpl (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; AVX-32-NEXT:    movl %ebp, %esp

diff  --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index 60f0c3430125..5fbb8aa864c1 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -651,12 +651,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fstpl (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
 ; AVX-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -889,6 +891,7 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fstps (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1008,21 +1011,25 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; AVX-32-NEXT:    fstps (%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vextractps $3, %xmm0, %eax
 ; AVX-32-NEXT:    shrl $31, %eax
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vextractps $1, %xmm1, %eax
 ; AVX-32-NEXT:    shrl $31, %eax
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vextractps $3, %xmm1, %eax
 ; AVX-32-NEXT:    shrl $31, %eax
 ; AVX-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; AVX-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    wait
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]

diff  --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
index b61c401a6e46..55134b4e0d6d 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
@@ -290,12 +290,14 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; NODQ-32-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstpl (%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; NODQ-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
 ; NODQ-32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -303,12 +305,14 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; NODQ-32-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
 ; NODQ-32-NEXT:    vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
 ; NODQ-32-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -422,6 +426,7 @@ define <8 x float> @sitofp_v8i64_v8f32(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; NODQ-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; NODQ-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -434,6 +439,7 @@ define <8 x float> @sitofp_v8i64_v8f32(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fstps (%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; NODQ-32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; NODQ-32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
@@ -514,41 +520,49 @@ define <8 x float> @uitofp_v8i64_v8f32(<8 x i64> %x) #0 {
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps (%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vextractps $3, %xmm0, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vextractps $1, %xmm3, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vextractps $3, %xmm3, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vextractps $1, %xmm2, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vextractps $3, %xmm2, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vextractps $1, %xmm1, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vextractps $3, %xmm1, %eax
 ; NODQ-32-NEXT:    shrl $31, %eax
 ; NODQ-32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; NODQ-32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
 ; NODQ-32-NEXT:    fstps {{[0-9]+}}(%esp)
+; NODQ-32-NEXT:    wait
 ; NODQ-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; NODQ-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; NODQ-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]

diff  --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index cd8a06f71d02..7c35ccebefb7 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -92,6 +92,7 @@ define <3 x double> @constrained_vector_fdiv_v3f64() #0 {
 ; CHECK-NEXT:    movapd %xmm0, %xmm1
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 ;
 ; AVX-LABEL: constrained_vector_fdiv_v3f64:
@@ -292,6 +293,7 @@ define <3 x double> @constrained_vector_frem_v3f64() #0 {
 ; CHECK-NEXT:    callq fmod
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -484,6 +486,7 @@ define <3 x double> @constrained_vector_fmul_v3f64() #0 {
 ; CHECK-NEXT:    movapd %xmm0, %xmm1
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 ;
 ; AVX-LABEL: constrained_vector_fmul_v3f64:
@@ -621,6 +624,7 @@ define <3 x double> @constrained_vector_fadd_v3f64() #0 {
 ; CHECK-NEXT:    movapd %xmm0, %xmm1
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 ;
 ; AVX-LABEL: constrained_vector_fadd_v3f64:
@@ -760,6 +764,7 @@ define <3 x double> @constrained_vector_fsub_v3f64() #0 {
 ; CHECK-NEXT:    movapd %xmm0, %xmm1
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 ;
 ; AVX-LABEL: constrained_vector_fsub_v3f64:
@@ -892,6 +897,7 @@ define <3 x double> @constrained_vector_sqrt_v3f64() #0 {
 ; CHECK-NEXT:    movapd %xmm0, %xmm1
 ; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 ;
 ; AVX-LABEL: constrained_vector_sqrt_v3f64:
@@ -1077,6 +1083,7 @@ define <3 x double> @constrained_vector_pow_v3f64() #0 {
 ; CHECK-NEXT:    callq pow
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -1333,6 +1340,7 @@ define <3 x double> @constrained_vector_powi_v3f64() #0 {
 ; CHECK-NEXT:    callq __powidf2
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -1570,6 +1578,7 @@ define <3 x double> @constrained_vector_sin_v3f64() #0 {
 ; CHECK-NEXT:    callq sin
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -1794,6 +1803,7 @@ define <3 x double> @constrained_vector_cos_v3f64() #0 {
 ; CHECK-NEXT:    callq cos
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -2018,6 +2028,7 @@ define <3 x double> @constrained_vector_exp_v3f64() #0 {
 ; CHECK-NEXT:    callq exp
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -2242,6 +2253,7 @@ define <3 x double> @constrained_vector_exp2_v3f64() #0 {
 ; CHECK-NEXT:    callq exp2
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -2466,6 +2478,7 @@ define <3 x double> @constrained_vector_log_v3f64() #0 {
 ; CHECK-NEXT:    callq log
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -2690,6 +2703,7 @@ define <3 x double> @constrained_vector_log10_v3f64() #0 {
 ; CHECK-NEXT:    callq log10
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -2914,6 +2928,7 @@ define <3 x double> @constrained_vector_log2_v3f64() #0 {
 ; CHECK-NEXT:    callq log2
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -3116,6 +3131,7 @@ define <3 x double> @constrained_vector_rint_v3f64() #0 {
 ; CHECK-NEXT:    callq rint
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -3286,6 +3302,7 @@ define <3 x double> @constrained_vector_nearby_v3f64() #0 {
 ; CHECK-NEXT:    callq nearbyint
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -3492,6 +3509,7 @@ define <3 x double> @constrained_vector_max_v3f64() #0 {
 ; CHECK-NEXT:    callq fmax
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -3742,6 +3760,7 @@ define <3 x double> @constrained_vector_min_v3f64() #0 {
 ; CHECK-NEXT:    callq fmin
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -5549,6 +5568,7 @@ define <3 x double> @constrained_vector_fpext_v3f32() #0 {
 ; CHECK-NEXT:    cvtss2sd %xmm2, %xmm2
 ; CHECK-NEXT:    movsd %xmm2, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 ;
 ; AVX-LABEL: constrained_vector_fpext_v3f32:
@@ -5694,6 +5714,7 @@ define <3 x double> @constrained_vector_ceil_v3f64() #0 {
 ; CHECK-NEXT:    callq ceil
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -5822,6 +5843,7 @@ define <3 x double> @constrained_vector_floor_v3f64() #0 {
 ; CHECK-NEXT:    callq floor
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -5972,6 +5994,7 @@ define <3 x double> @constrained_vector_round_v3f64() #0 {
 ; CHECK-NEXT:    callq round
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -6112,6 +6135,7 @@ define <3 x double> @constrained_vector_trunc_v3f64() #0 {
 ; CHECK-NEXT:    callq trunc
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
@@ -6334,6 +6358,7 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 {
 ; CHECK-NEXT:    cvtsi2sd %eax, %xmm0
 ; CHECK-NEXT:    movsd %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
 ;
@@ -6401,6 +6426,7 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; CHECK-NEXT:    cvtsi2sd %rdx, %xmm2
 ; CHECK-NEXT:    movsd %xmm2, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 ;
 ; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64:
@@ -6988,6 +7014,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 {
 ; CHECK-NEXT:    cvtsi2sd %rax, %xmm0
 ; CHECK-NEXT:    movsd %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    movapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
 ;
@@ -7096,6 +7123,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; CHECK-NEXT:    addpd %xmm4, %xmm2
 ; CHECK-NEXT:    movlpd %xmm2, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
 ; CHECK-NEXT:    retq
 ;
 ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64: