[llvm] d5c97f4 - [X86] Teach X86FloatingPoint's handleCall to only erase the FP stack if there is a regmask operand that clobbers the FP stack.

Mon Jul 12 10:16:09 PDT 2021

Author: Craig Topper
Date: 2021-07-12T10:15:38-07:00
New Revision: d5c97f4bf0449f8a4ad6260c75cbf6ff3605ac04

URL: https://github.com/llvm/llvm-project/commit/d5c97f4bf0449f8a4ad6260c75cbf6ff3605ac04
DIFF: https://github.com/llvm/llvm-project/commit/d5c97f4bf0449f8a4ad6260c75cbf6ff3605ac04.diff

LOG: [X86] Teach X86FloatingPoint's handleCall to only erase the FP stack if there is a regmask operand that clobbers the FP stack.

There are some calls to functions like `__alloca` that are missing
a regmask operand. Lack of a regmask operand means that all
registers that aren't mentioned by def operands are preserved.
__alloca only updates EAX and ESP and has def operands for
them so this is ok. Because there is no regmask the register
allocator won't spill the FP registers across the call. Assuming
we want to keep the FP stack untouched across these calls, we
need to handle this in the FP stackifier.

We might want to add a proper regmask operand to the code that
creates these calls to indicate all registers are preserved, but we'd
still need this change to the FP stackifier to know to preserve the
FP stack for such a regmask.

The test is kind of long, but bugpoint wasn't able to reduce it
any further.

Fixes PR50782

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D105762

Added: 
    llvm/test/CodeGen/X86/pr50782.ll

Modified: 
    llvm/lib/Target/X86/X86FloatingPoint.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index ef45cfb7a14f..e35989fad5ad 100644

--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -982,8 +982,24 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
   MachineInstr &MI = *I;
   unsigned STReturns = 0;
 
+  bool ClobbersFPStack = false;
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     MachineOperand &Op = MI.getOperand(i);
+    // Check if this call clobbers the FP stack. Testing FP0 alone
+    // is sufficient.
+    if (Op.isRegMask()) {
+      bool ClobbersFP0 = Op.clobbersPhysReg(X86::FP0);
+#ifndef NDEBUG
+      static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
+      for (unsigned i = 1; i != 8; ++i)
+        assert(Op.clobbersPhysReg(X86::FP0 + i) == ClobbersFP0 &&
+               "Inconsistent FP register clobber");
+#endif
+
+      if (ClobbersFP0)
+        ClobbersFPStack = true;
+    }
+
     if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
       continue;
 
@@ -998,6 +1014,14 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
     --e;
   }
 
+  // Most calls should have a regmask that clobbers the FP registers. If it
+  // isn't present then the register allocator didn't spill the FP registers
+  // so they are still on the stack.
+  assert((ClobbersFPStack || STReturns == 0) &&
+         "ST returns without FP stack clobber");
+  if (!ClobbersFPStack)
+    return;
+
   unsigned N = countTrailingOnes(STReturns);
 
   // FP registers used for function return must be consecutive starting at

diff  --git a/llvm/test/CodeGen/X86/pr50782.ll b/llvm/test/CodeGen/X86/pr50782.ll
new file mode 100644
index 000000000000..8a992ad618f3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr50782.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-w64-windows-gnu | FileCheck %s
+
+@a = global i32 0, align 4
+@b = global float 0.000000e+00, align 4
+@d = global float 0.000000e+00, align 4
+@f = global i32 0, align 4
+@g = global float 0.000000e+00, align 4
+@e = global i32 0, align 4
+@c = global float* null, align 4
+
+; The FP stack should be preserved across the call to __alloca.
+define void @h(float %i) {
+; CHECK-LABEL: h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    .cfi_def_cfa_register %ebp
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $32, %esp
+; CHECK-NEXT:    movl %esp, %esi
+; CHECK-NEXT:    .cfi_offset %esi, -12
+; CHECK-NEXT:    flds 8(%ebp)
+; CHECK-NEXT:    movl _a, %ecx
+; CHECK-NEXT:    leal 3(%ecx), %eax
+; CHECK-NEXT:    andl $-4, %eax
+; CHECK-NEXT:    calll __alloca
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    andl $-16, %eax
+; CHECK-NEXT:    movl %eax, %esp
+; CHECK-NEXT:    fsts 8(%esi) # 4-byte Folded Spill
+; CHECK-NEXT:    fadds _b
+; CHECK-NEXT:    fsts _d
+; CHECK-NEXT:    fld1
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fld %st(2)
+; CHECK-NEXT:    je LBB0_2
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    movl _f, %ecx
+; CHECK-NEXT:    flds (%eax,%ecx,4)
+; CHECK-NEXT:    fld %st(3)
+; CHECK-NEXT:  LBB0_2: # %for.cond1.preheader
+; CHECK-NEXT:    movl _e, %ecx
+; CHECK-NEXT:    movl %ecx, 12(%esi)
+; CHECK-NEXT:    fildl 12(%esi)
+; CHECK-NEXT:    movl _c, %edx
+; CHECK-NEXT:    jmp LBB0_3
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB0_5: # %for.inc
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    fxch %st(5)
+; CHECK-NEXT:    fadd %st(4), %st
+; CHECK-NEXT:    fxch %st(5)
+; CHECK-NEXT:  LBB0_3: # %for.cond1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    fld %st(5)
+; CHECK-NEXT:    fmul %st(4), %st
+; CHECK-NEXT:    fdiv %st(2), %st
+; CHECK-NEXT:    fadd %st(3), %st
+; CHECK-NEXT:    fsts _g
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    fucom %st(1)
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fnstsw %ax
+; CHECK-NEXT:    # kill: def $ah killed $ah killed $ax
+; CHECK-NEXT:    sahf
+; CHECK-NEXT:    jbe LBB0_5
+; CHECK-NEXT:  # %bb.4: # %if.then
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    flds 8(%esi) # 4-byte Folded Reload
+; CHECK-NEXT:    fstps (%edx,%ecx,4)
+; CHECK-NEXT:    jmp LBB0_5
+entry:
+  %0 = load i32, i32* @a, align 4
+  %1 = alloca i8, i32 %0, align 16
+  %2 = load float, float* @b, align 4
+  %add = fadd float %2, %i
+  store float %add, float* @d, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.cond1.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %3 = bitcast i8* %1 to float*
+  %4 = load i32, i32* @f, align 4
+  %arrayidx.le = getelementptr inbounds float, float* %3, i32 %4
+  %5 = load float, float* %arrayidx.le, align 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.body.preheader, %entry
+  %k.0.lcssa = phi float [ %5, %for.body.preheader ], [ undef, %entry ]
+  %l.0.lcssa = phi float [ %add, %for.body.preheader ], [ 1.000000e+00, %entry ]
+  %6 = load i32, i32* @e, align 4
+  %conv = sitofp i32 %6 to float
+  %7 = load float*, float** @c, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %7, i32 %6
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.cond1.preheader
+  %m.0 = phi float [ %add5, %for.inc ], [ %add, %for.cond1.preheader ]
+  %mul = fmul float %m.0, 0.000000e+00
+  %div = fdiv float %mul, %l.0.lcssa
+  %add2 = fadd float %k.0.lcssa, %div
+  store float %add2, float* @g, align 4
+  %cmp = fcmp olt float %add2, %conv
+  br i1 %cmp, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.cond1
+  store float %i, float* %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.cond1
+  %add5 = fadd float %m.0, 1.000000e+00
+  br label %for.cond1
+}