[llvm] [X86] Resolve FIXME: Add FPCW as a rounding control register (PR #82452)

Fri Mar 1 11:14:46 PST 2024

https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/82452

>From 3f809108c40915564fbf4faf7a168c12811fbc6e Mon Sep 17 00:00:00 2001
From: Rose <83477269+AtariDreams at users.noreply.github.com>
Date: Wed, 21 Feb 2024 14:53:23 -0500
Subject: [PATCH 1/2] [X86] Insert wait if instruction right before call is a
 waiting one

Adding FPCW broke tests because it caused LLVM to no longer kill the instruction before a call. That prevented LLVM from treating x87 as an operand, so the call was no longer eligible for a wait before it.

This patch now has LLVM add a wait after an x87 instruction if a call immediately follows it.
---
 llvm/lib/Target/X86/X86InsertWait.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86InsertWait.cpp b/llvm/lib/Target/X86/X86InsertWait.cpp
index 69a3d32a931498..10f141abe9c786 100644
--- a/llvm/lib/Target/X86/X86InsertWait.cpp
+++ b/llvm/lib/Target/X86/X86InsertWait.cpp
@@ -115,7 +115,8 @@ bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
       // If the following instruction is an X87 instruction and isn't an X87
       // non-waiting control instruction, we can omit insert wait instruction.
       MachineBasicBlock::iterator AfterMI = std::next(MI);
-      if (AfterMI != MBB.end() && X86::isX87Instruction(*AfterMI) &&
+      if (AfterMI != MBB.end() && !AfterMI->isCall() &&
+          X86::isX87Instruction(*AfterMI) &&
           !isX87NonWaitingControlInstruction(*AfterMI))
         continue;
 

>From 251e5b32031d0d4ab5153160867d2eabb60d6969 Mon Sep 17 00:00:00 2001
From: Rose <83477269+AtariDreams at users.noreply.github.com>
Date: Tue, 20 Feb 2024 20:41:40 -0500
Subject: [PATCH 2/2] [X86] Resolve FIXME: Add FPCW as a rounding control
 register

To prevent tests from breaking, another fix had to be made: Now, we check if the instruction after a waiting instruction is a call, and if so, we insert the wait.
---
 llvm/lib/Target/X86/X86FloatingPoint.cpp    |  3 +
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp |  4 +-
 llvm/lib/Target/X86/X86InsertWait.cpp       |  4 +
 llvm/test/CodeGen/X86/pr59305.ll            | 89 +++++++++++++++------
 4 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index ca4d03913d093e..5064e75fb44348 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -432,6 +432,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
     if (MI.isCall())
       FPInstClass = X86II::SpecialFP;
 
+    if (MI.isReturn())
+      FPInstClass = X86II::SpecialFP;
+
     if (FPInstClass == X86II::NotFP)
       continue;  // Efficiently ignore non-fp insts!
 
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index be8275c92e11ae..c7ef11aede886a 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -670,9 +670,7 @@ const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
 }
 
 ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
-  // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
-  // tests at the moment, which is not what we expected.
-  static const MCPhysReg RCRegs[] = {X86::MXCSR};
+  static const MCPhysReg RCRegs[] = {X86::FPCW, X86::MXCSR};
   return RCRegs;
 }
 
diff --git a/llvm/lib/Target/X86/X86InsertWait.cpp b/llvm/lib/Target/X86/X86InsertWait.cpp
index 10f141abe9c786..9fa43a802d3907 100644
--- a/llvm/lib/Target/X86/X86InsertWait.cpp
+++ b/llvm/lib/Target/X86/X86InsertWait.cpp
@@ -106,16 +106,20 @@ bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
       // Jump non X87 instruction.
       if (!X86::isX87Instruction(*MI))
         continue;
+
       // If the instruction instruction neither has float exception nor is
       // a load/store instruction, or the instruction is x87 control
       // instruction, do not insert wait.
       if (!(MI->mayRaiseFPException() || MI->mayLoadOrStore()) ||
           isX87ControlInstruction(*MI))
         continue;
+
       // If the following instruction is an X87 instruction and isn't an X87
       // non-waiting control instruction, we can omit insert wait instruction.
       MachineBasicBlock::iterator AfterMI = std::next(MI);
       if (AfterMI != MBB.end() && !AfterMI->isCall() &&
+          !AfterMI->isTerminator() && !AfterMI->isReturn() &&
+          !AfterMI->hasUnmodeledSideEffects() &&
           X86::isX87Instruction(*AfterMI) &&
           !isX87NonWaitingControlInstruction(*AfterMI))
         continue;
diff --git a/llvm/test/CodeGen/X86/pr59305.ll b/llvm/test/CodeGen/X86/pr59305.ll
index c2f6d21a41d4dc..4172aa6204def2 100644
--- a/llvm/test/CodeGen/X86/pr59305.ll
+++ b/llvm/test/CodeGen/X86/pr59305.ll
@@ -1,32 +1,69 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefix=X86-64
+; RUN: llc -mtriple=i686-pc-linux < %s | FileCheck %s --check-prefix=X86
 
 define double @foo(double %0) #0 {
-; CHECK-LABEL: foo:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $24, %rsp
-; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT:    movl $1024, %edi # imm = 0x400
-; CHECK-NEXT:    callq fesetround@PLT
-; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
-; CHECK-NEXT:    divsd (%rsp), %xmm1 # 8-byte Folded Reload
-; CHECK-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movl $1024, %edi # imm = 0x400
-; CHECK-NEXT:    callq fesetround@PLT
-; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
-; CHECK-NEXT:    divsd (%rsp), %xmm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    movl $1024, %edi # imm = 0x400
-; CHECK-NEXT:    callq fesetround@PLT
-; CHECK-NEXT:    movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
-; CHECK-NEXT:    divsd (%rsp), %xmm2 # 8-byte Folded Reload
-; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
-; CHECK-NEXT:    # xmm1 = mem[0],zero
-; CHECK-NEXT:    callq fma@PLT
-; CHECK-NEXT:    addq $24, %rsp
-; CHECK-NEXT:    retq
+; X86-64-LABEL: foo:
+; X86-64:       # %bb.0:
+; X86-64-NEXT:    subq $24, %rsp
+; X86-64-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; X86-64-NEXT:    movl $1024, %edi # imm = 0x400
+; X86-64-NEXT:    callq fesetround@PLT
+; X86-64-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
+; X86-64-NEXT:    divsd (%rsp), %xmm1 # 8-byte Folded Reload
+; X86-64-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X86-64-NEXT:    movl $1024, %edi # imm = 0x400
+; X86-64-NEXT:    callq fesetround@PLT
+; X86-64-NEXT:    movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; X86-64-NEXT:    divsd (%rsp), %xmm0 # 8-byte Folded Reload
+; X86-64-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X86-64-NEXT:    movl $1024, %edi # imm = 0x400
+; X86-64-NEXT:    callq fesetround@PLT
+; X86-64-NEXT:    movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
+; X86-64-NEXT:    divsd (%rsp), %xmm2 # 8-byte Folded Reload
+; X86-64-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; X86-64-NEXT:    # xmm0 = mem[0],zero
+; X86-64-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; X86-64-NEXT:    # xmm1 = mem[0],zero
+; X86-64-NEXT:    callq fma@PLT
+; X86-64-NEXT:    addq $24, %rsp
+; X86-64-NEXT:    retq
+;
+; X86-LABEL: foo:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; X86-NEXT:    wait
+; X86-NEXT:    movl $1024, (%esp) # imm = 0x400
+; X86-NEXT:    calll fesetround@PLT
+; X86-NEXT:    fld1
+; X86-NEXT:    fstl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    fdivrp %st, %st(1)
+; X86-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; X86-NEXT:    wait
+; X86-NEXT:    movl $1024, (%esp) # imm = 0x400
+; X86-NEXT:    calll fesetround@PLT
+; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    fdivp %st, %st(1)
+; X86-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; X86-NEXT:    wait
+; X86-NEXT:    movl $1024, (%esp) # imm = 0x400
+; X86-NEXT:    calll fesetround@PLT
+; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    fdivp %st, %st(1)
+; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    fstpl (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll fma
+; X86-NEXT:    addl $60, %esp
+; X86-NEXT:    retl
     %2 = call i32 @fesetround(i32 noundef 1024)
     %3 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
     %4 = call i32 @fesetround(i32 noundef 1024)