[llvm] 2bb9603 - [X86][ConstraintFP] Model `MXCSR` for function call
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 26 18:07:44 PST 2023
Author: Phoebe Wang
Date: 2023-01-27T10:07:38+08:00
New Revision: 2bb960302aa5b53fd387c05f917043b421ade53e
URL: https://github.com/llvm/llvm-project/commit/2bb960302aa5b53fd387c05f917043b421ade53e
DIFF: https://github.com/llvm/llvm-project/commit/2bb960302aa5b53fd387c05f917043b421ade53e.diff
LOG: [X86][ConstraintFP] Model `MXCSR` for function call
This patch is inspired by D111433. It may affect performance under
strict FP mode, but it preserves the correct rounding behavior across
function calls.
Fixes #59305
Reviewed By: sepavloff
Differential Revision: https://reviews.llvm.org/D139549
Added:
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
llvm/test/CodeGen/X86/pr59305.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 639d48e342ef3..4374f7e294cda 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4488,6 +4488,12 @@ class TargetLowering : public TargetLoweringBase {
return nullptr;
}
+ /// Returns a 0 terminated array of rounding control registers that can be
+ /// attached into strict FP call.
+ virtual const MCPhysReg *getRoundingControlRegisters() const {
+ return nullptr;
+ }
+
/// This callback is used to prepare for a volatile or atomic load.
/// It takes a chain node as input and returns the chain for the load itself.
///
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 338172e4e10a7..5cd5de71b9d0b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -1161,6 +1161,14 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
}
}
+ // Add rounding control registers as implicit def for function call.
+ if (II.isCall() && MF->getFunction().hasFnAttribute(Attribute::StrictFP)) {
+ const MCPhysReg *RCRegs = TLI->getRoundingControlRegisters();
+ if (RCRegs)
+ for (; *RCRegs; ++RCRegs)
+ UsedRegs.push_back(*RCRegs);
+ }
+
// Finally mark unused registers as dead.
if (!UsedRegs.empty() || !II.implicit_defs().empty() || II.hasOptionalDef())
MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4c463dae0a004..838a0e4bb095c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3111,6 +3111,13 @@ const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
return ScratchRegs;
}
+const MCPhysReg *X86TargetLowering::getRoundingControlRegisters() const {
+ // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
+ // tests at the moment, which is not what we expected.
+ static const MCPhysReg RCRegs[] = { X86::MXCSR, 0 };
+ return RCRegs;
+}
+
/// Lowers masks values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index c5c1150472718..7f56d8da004ec 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1702,6 +1702,7 @@ namespace llvm {
LLVMContext &Context) const override;
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+ const MCPhysReg *getRoundingControlRegisters() const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
index 18b6b4ad2055d..3c99d4daa806f 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
@@ -380,10 +380,12 @@ define half @uitofp_i64tof16(i64 %x) #0 {
; SSE2-NEXT: orq %rax, %rcx
; SSE2-NEXT: testq %rdi, %rdi
; SSE2-NEXT: cmovnsq %rdi, %rcx
-; SSE2-NEXT: cvtsi2ss %rcx, %xmm0
-; SSE2-NEXT: jns .LBB9_2
+; SSE2-NEXT: cvtsi2ss %rcx, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: addss %xmm1, %xmm0
+; SSE2-NEXT: js .LBB9_2
; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: addss %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: .LBB9_2:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: callq __truncsfhf2@PLT
diff --git a/llvm/test/CodeGen/X86/pr59305.ll b/llvm/test/CodeGen/X86/pr59305.ll
index 6c6ee99359ffa..cb98e9d5dda8e 100644
--- a/llvm/test/CodeGen/X86/pr59305.ll
+++ b/llvm/test/CodeGen/X86/pr59305.ll
@@ -4,23 +4,28 @@
define double @foo(double %0) #0 {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movl $1024, %edi # imm = 0x400
; CHECK-NEXT: callq fesetround@PLT
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: divsd (%rsp), %xmm0 # 8-byte Folded Reload
-; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: divsd (%rsp), %xmm1 # 8-byte Folded Reload
+; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movl $1024, %edi # imm = 0x400
; CHECK-NEXT: callq fesetround@PLT
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: divsd (%rsp), %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movl $1024, %edi # imm = 0x400
; CHECK-NEXT: callq fesetround@PLT
-; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: divsd (%rsp), %xmm2 # 8-byte Folded Reload
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: movaps %xmm0, %xmm2
+; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: callq fma@PLT
-; CHECK-NEXT: popq %rax
+; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: retq
%2 = call i32 @fesetround(i32 noundef 1024)
%3 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
More information about the llvm-commits
mailing list