[llvm] 55fdecc - [SDAG][X86] Remove hack needed to avoid missing x87 FPU stack pops (#128055)

via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 3 04:23:32 PST 2025


Author: Benjamin Maxwell
Date: 2025-03-03T12:23:28Z
New Revision: 55fdeccc4567bcd4e3f8df0d177195880a194a6a

URL: https://github.com/llvm/llvm-project/commit/55fdeccc4567bcd4e3f8df0d177195880a194a6a
DIFF: https://github.com/llvm/llvm-project/commit/55fdeccc4567bcd4e3f8df0d177195880a194a6a.diff

LOG: [SDAG][X86] Remove hack needed to avoid missing x87 FPU stack pops (#128055)

A (two-result) node like `FMODF` or `FFREXP` may be expanded to a
library call whose prototype looks like `float(float, float*)` -- that
is, it returns one float directly from the call and another via an
output pointer. The first result of the node maps to the value returned
by the call, and the second result maps to the value returned via the
output pointer.
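
For reference, the standard C library prototypes for the two examples
mentioned have this shape (the `float` variants):

```
// modff writes the integral part through *iptr and returns the
// fractional part by value.
float modff(float x, float *iptr);

// frexpf writes the exponent through *exp and returns the normalized
// fraction by value.
float frexpf(float x, int *exp);
```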

If only the second result is used after the expansion, we hit an issue
on x87 targets:

```
// Before expansion: 
t0, t1 = fmodf x
return t1  // t0 is unused
```

Expanded result:
```
ptr = alloca
ch0 = call modf ptr
t0, ch1 = copy_from_reg, ch0 // t0 unused
t1, ch2 = ldr ptr, ch1
return t1
```

So far things are alright, but the DAGCombiner optimizes this to:
```
ptr = alloca
ch0 = call modf ptr
// copy_from_reg optimized out
t1, ch1 = ldr ptr, ch0
return t1
```

On most targets this is fine: the optimized-out `copy_from_reg` is
unused, so removing it is a no-op. However, x87 uses a floating-point
register stack, and if the `copy_from_reg` is optimized out, the pop
needed to remove the unused result from that stack is never emitted.

The prior solution was to attach the chain from the `copy_from_reg` to
the root. That did work; however, the root is not always available (it
is set to null during type legalization). The alternate solution in
this patch is to replace the `copy_from_reg` with an
`X86ISD::POP_FROM_X87_REG` node within the X86 call lowering. This node
is the same as `copy_from_reg`, except it makes explicit that it may
lower to an x87 FPU stack pop. Optimizations should handle this node
more cautiously than a normal CopyFromReg to avoid removing a required
FPU stack pop.

```
ptr = alloca
ch0 = call modf ptr
t0, ch1 = pop_from_x87_reg, ch0 // t0 unused
t1, ch2 = ldr ptr, ch1
return t1
```

Using this node ensures that a required x87 FPU pop is not removed by
the DAGCombiner.
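
As an illustration (not part of the patch), the problematic pattern is
one where only the value returned via the output pointer is used, as in
this C snippet, which mirrors the pattern exercised by the new
`only_use_intergral` tests in the diff:

```
#include <math.h>

// Only the integral part (returned via the output pointer) is used;
// the value returned by modff itself is discarded, so its x87 stack
// slot must still be popped.
float only_integral_part(float x) {
  float integral;
  (void)modff(x, &integral);
  return integral;
}
```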

This is an alternate solution for #127976.

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86ISelLoweringCall.cpp
    llvm/test/CodeGen/PowerPC/llvm.modf.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d38a7c54c8cea..df30148b78b65 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2635,22 +2635,6 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
     Results.push_back(LoadResult);
   }
 
-  if (CallRetResNo && !Node->hasAnyUseOfValue(*CallRetResNo)) {
-    // FIXME: Find a way to avoid updating the root. This is needed for x86,
-    // which uses a floating-point stack. If (for example) the node to be
-    // expanded has two results one floating-point which is returned by the
-    // call, and one integer result, returned via an output pointer. If only the
-    // integer result is used then the `CopyFromReg` for the FP result may be
-    // optimized out. This prevents an FP stack pop from being emitted for it.
-    // Setting the root like this ensures there will be a use of the
-    // `CopyFromReg` chain, and ensures the FP pop will be emitted.
-    SDValue NewRoot =
-        getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
-    setRoot(NewRoot);
-    // Ensure the new root is reachable from the results.
-    Results[0] = getMergeValues({Results[0], NewRoot}, DL);
-  }
-
   return true;
 }
 

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 84bcdae520885..a7a0e84ba2b60 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -6717,6 +6717,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     ReplaceNode(Node, Res);
     return;
   }
+  case X86ISD::POP_FROM_X87_REG: {
+    SDValue Chain = Node->getOperand(0);
+    Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+    SDValue Glue;
+    if (Node->getNumValues() == 3)
+      Glue = Node->getOperand(2);
+    SDValue Copy =
+        CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
+    ReplaceNode(Node, Copy.getNode());
+    return;
+  }
   }
 
   SelectCode(Node);

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5b9db77a3330f..3e23250d861c5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35114,6 +35114,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CVTTP2SIS_SAE)
   NODE_NAME_CASE(CVTTP2UIS)
   NODE_NAME_CASE(MCVTTP2UIS)
+  NODE_NAME_CASE(POP_FROM_X87_REG)
   }
   return nullptr;
 #undef NODE_NAME_CASE

diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index fe79fefeed631..4a2b35e9efe7c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -81,6 +81,15 @@ namespace llvm {
     // marker instruction.
     CALL_RVMARKER,
 
+    /// The same as ISD::CopyFromReg except that this node makes it explicit
+    /// that it may lower to an x87 FPU stack pop. Optimizations should be more
+    /// cautious when handling this node than a normal CopyFromReg to avoid
+    /// removing a required FPU stack pop. A key requirement is optimizations
+    /// should not optimize any users of a chain that contains a
+    /// POP_FROM_X87_REG to use a chain from a point earlier than the
+    /// POP_FROM_X87_REG (which may remove a required FPU stack pop).
+    POP_FROM_X87_REG,
+
     /// X86 compare and logical compare instructions.
     CMP,
     FCMP,

diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index ee4bb758102f4..80b4aeeda1e00 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -1095,6 +1095,15 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
   return DAG.getBitcast(ValVT, ValReturned);
 }
 
+static SDValue getPopFromX87Reg(SelectionDAG &DAG, SDValue Chain,
+                                const SDLoc &dl, Register Reg, EVT VT,
+                                SDValue Glue) {
+  SDVTList VTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getRegister(Reg, VT), Glue};
+  return DAG.getNode(X86ISD::POP_FROM_X87_REG, dl, VTs,
+                     ArrayRef(Ops, Glue.getNode() ? 3 : 2));
+}
+
 /// Lower the result values of a call into the
 /// appropriate copies out of appropriate physical registers.
 ///
@@ -1145,8 +1154,8 @@ SDValue X86TargetLowering::LowerCallResult(
     // If we prefer to use the value in xmm registers, copy it out as f80 and
     // use a truncate to move it from fp stack reg to xmm reg.
     bool RoundAfterCopy = false;
-    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
-        isScalarFPTypeInSSEReg(VA.getValVT())) {
+    bool X87Result = VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1;
+    if (X87Result && isScalarFPTypeInSSEReg(VA.getValVT())) {
       if (!Subtarget.hasX87())
         report_fatal_error("X87 register return with X87 disabled");
       CopyVT = MVT::f80;
@@ -1160,8 +1169,12 @@ SDValue X86TargetLowering::LowerCallResult(
       Val =
           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
     } else {
-      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
-                  .getValue(1);
+      Chain =
+          X87Result
+              ? getPopFromX87Reg(DAG, Chain, dl, VA.getLocReg(), CopyVT, InGlue)
+                    .getValue(1)
+              : DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
+                    .getValue(1);
       Val = Chain.getValue(0);
       InGlue = Chain.getValue(2);
     }

diff --git a/llvm/test/CodeGen/PowerPC/llvm.modf.ll b/llvm/test/CodeGen/PowerPC/llvm.modf.ll
index 69e3b22c7352c..1b137c786cc91 100644
--- a/llvm/test/CodeGen/PowerPC/llvm.modf.ll
+++ b/llvm/test/CodeGen/PowerPC/llvm.modf.ll
@@ -328,3 +328,108 @@ define { ppc_fp128, ppc_fp128 } @test_modf_ppcf128(ppc_fp128 %a) {
   %result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
   ret { ppc_fp128, ppc_fp128 } %result
 }
+
+define ppc_fp128 @test_modf_ppcf128_only_use_intergral(ppc_fp128 %a) {
+; CHECK-LABEL: test_modf_ppcf128_only_use_intergral:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    std r0, 64(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    addi r5, r1, 32
+; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    lfd f1, 32(r1)
+; CHECK-NEXT:    lfd f2, 40(r1)
+; CHECK-NEXT:    addi r1, r1, 48
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+  %result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
+  %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
+  ret ppc_fp128 %result.1
+}
+
+define ppc_fp128 @test_modf_ppcf128_only_use_fractional(ppc_fp128 %a) {
+; CHECK-LABEL: test_modf_ppcf128_only_use_fractional:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    std r0, 64(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    addi r5, r1, 32
+; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 48
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+  %result = call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
+  %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
+  ret ppc_fp128 %result.1
+}
+
+define { ppc_fp128, ppc_fp128 } @test_modf_ppcf128_tail_call(ppc_fp128 %a) {
+; CHECK-LABEL: test_modf_ppcf128_tail_call:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    std r0, 64(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    addi r5, r1, 32
+; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    lfd f3, 32(r1)
+; CHECK-NEXT:    lfd f4, 40(r1)
+; CHECK-NEXT:    addi r1, r1, 48
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+  %result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
+  ret { ppc_fp128, ppc_fp128 } %result
+}
+
+define ppc_fp128 @test_modf_ppcf128_only_use_intergral_tail_call(ppc_fp128 %a) {
+; CHECK-LABEL: test_modf_ppcf128_only_use_intergral_tail_call:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    std r0, 64(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    addi r5, r1, 32
+; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    lfd f1, 32(r1)
+; CHECK-NEXT:    lfd f2, 40(r1)
+; CHECK-NEXT:    addi r1, r1, 48
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+  %result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
+  %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
+  ret ppc_fp128 %result.1
+}
+
+define ppc_fp128 @test_modf_ppcf128_only_use_fractional_tail_call(ppc_fp128 %a) {
+; CHECK-LABEL: test_modf_ppcf128_only_use_fractional_tail_call:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    std r0, 64(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    addi r5, r1, 32
+; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 48
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+  %result = tail call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %a)
+  %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
+  ret ppc_fp128 %result.1
+}


        

