[PATCH] D36388: [X86][SandyBridge] Additional updates to the SNB instructions scheduling information

Sat Aug 12 12:52:11 PDT 2017

dim added a comment.

In https://reviews.llvm.org/D36388#836337, @RKSimon wrote:

> @dim What effect does this patch have on PR34080? The changes in pr34080.ll aren't looking hopeful

It seems to fix my minimized test case:

  #include <stdio.h>

  __attribute__((noinline)) int g(double *tx) {
    int bad = (tx[2] != 0x1.93p+16);
    printf("tx[0]=%.20a tx[1]=%.20a tx[2]=%.20a: %s\n", tx[0], tx[1], tx[2], bad ? "Bad!" : "OK");
    return bad;
  }

  __attribute__((noinline)) int f(long double z) {
    int i;
    double tx[3];
    for (i = 0; i < 2; i++) {
      tx[i] = (double)((int)(z));
      z = (z - tx[i]) * 1.6777216e+07;
      //printf("z=%.20La\n", z);
    }
    tx[2] = z;
    return g(tx);
  }

  int main(void)
  {
    return f(0x1.b2f3ee96e7600326p+23L);
  }

The resulting assembly is, however, very slightly different from what was generated before https://reviews.llvm.org/rL307529.  With trunk https://reviews.llvm.org/rL307528, it gives (only the body of function `f` shown):

  fldt    64(%rsp)
  fnstcw  6(%rsp)
  movzwl  6(%rsp), %eax
  movw    $3199, 6(%rsp)          # imm = 0xC7F
  fldcw   6(%rsp)
  movw    %ax, 6(%rsp)
  fistl   8(%rsp)
  fldcw   6(%rsp)
  cvtsi2sdl       8(%rsp), %xmm0
  movsd   %xmm0, 32(%rsp)
  movsd   %xmm0, 24(%rsp)
  fsubl   24(%rsp)
  flds    .LCPI1_0(%rip)
  fmul    %st(0), %st(1)
  fnstcw  4(%rsp)
  movzwl  4(%rsp), %eax
  movw    $3199, 4(%rsp)          # imm = 0xC7F
  fldcw   4(%rsp)
  movw    %ax, 4(%rsp)
  fxch    %st(1)
  fistl   12(%rsp)
  fldcw   4(%rsp)
  xorps   %xmm0, %xmm0
  cvtsi2sdl       12(%rsp), %xmm0
  movsd   %xmm0, 40(%rsp)
  movsd   %xmm0, 16(%rsp)
  fsubl   16(%rsp)
  fmulp   %st(1)
  fstpl   48(%rsp)
  leaq    32(%rsp), %rdi
  callq   g
  addq    $56, %rsp
  retq

With trunk https://reviews.llvm.org/rL307529 through https://reviews.llvm.org/rL310782, it gives:

  fnstcw  6(%rsp)
  movzwl  6(%rsp), %eax
  movw    $3199, 6(%rsp)          # imm = 0xC7F
  fldcw   6(%rsp)
  fldt    64(%rsp)
  movw    %ax, 6(%rsp)
  fistl   8(%rsp)
  fldcw   6(%rsp)
  cvtsi2sdl       8(%rsp), %xmm0
  movsd   %xmm0, 32(%rsp)
  movsd   %xmm0, 24(%rsp)
  fsubl   24(%rsp)
  fnstcw  4(%rsp)
  flds    .LCPI1_0(%rip)
  movzwl  4(%rsp), %eax
  movw    $3199, 4(%rsp)          # imm = 0xC7F
  fldcw   4(%rsp)
  fmul    %st(0), %st(1)
  movw    %ax, 4(%rsp)
  fxch    %st(1)
  fistl   12(%rsp)
  fldcw   4(%rsp)
  xorps   %xmm0, %xmm0
  cvtsi2sdl       12(%rsp), %xmm0
  movsd   %xmm0, 40(%rsp)
  movsd   %xmm0, 16(%rsp)
  fsubl   16(%rsp)
  fmulp   %st(1)
  fstpl   48(%rsp)
  leaq    32(%rsp), %rdi
  callq   g
  addq    $56, %rsp
  retq

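The most important change here (and the source of the errors) is that the `flds .LCPI1_0(%rip)` and `fmul %st(0), %st(1)` are moved apart from each other, so that the `fmul` now executes after the `fldcw 4(%rsp)` that loads the modified control word (0xC7F) instead of before it.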

However, applying https://reviews.llvm.org/D36388 to https://reviews.llvm.org/rL310782 results in:

  fnstcw  6(%rsp)
  fldt    64(%rsp)
  movzwl  6(%rsp), %eax
  movw    $3199, 6(%rsp)          # imm = 0xC7F
  fldcw   6(%rsp)
  movw    %ax, 6(%rsp)
  fistl   8(%rsp)
  fldcw   6(%rsp)
  cvtsi2sdl       8(%rsp), %xmm0
  movsd   %xmm0, 32(%rsp)
  movsd   %xmm0, 24(%rsp)
  fsubl   24(%rsp)
  flds    .LCPI1_0(%rip)
  fnstcw  4(%rsp)
  fmul    %st(0), %st(1)
  movzwl  4(%rsp), %eax
  movw    $3199, 4(%rsp)          # imm = 0xC7F
  fldcw   4(%rsp)
  movw    %ax, 4(%rsp)
  fxch    %st(1)
  fistl   12(%rsp)
  fldcw   4(%rsp)
  xorps   %xmm0, %xmm0
  cvtsi2sdl       12(%rsp), %xmm0
  movsd   %xmm0, 40(%rsp)
  movsd   %xmm0, 16(%rsp)
  fsubl   16(%rsp)
  fmulp   %st(1)
  fstpl   48(%rsp)
  leaq    32(%rsp), %rdi
  callq   g
  addq    $56, %rsp
  retq

So the `flds .LCPI1_0(%rip)` and `fmul %st(0), %st(1)` are now separated only by a `fnstcw 4(%rsp)`, which merely stores the control word and does not seem to affect the outcome; the `fmul` is again executed before the `fldcw 4(%rsp)`.

Diff between the assembly output of stock https://reviews.llvm.org/rL307528 and that of https://reviews.llvm.org/rL310782 with https://reviews.llvm.org/D36388 applied:

  --- pio2n-r307528.s
  +++ pio2n-r310782-D36388.s
  @@ -53,8 +53,8 @@
          subq    $56, %rsp
   .Lcfi2:
          .cfi_def_cfa_offset 64
  -       fldt    64(%rsp)
          fnstcw  6(%rsp)
  +       fldt    64(%rsp)
          movzwl  6(%rsp), %eax
          movw    $3199, 6(%rsp)          # imm = 0xC7F
          fldcw   6(%rsp)
  @@ -66,8 +66,8 @@
          movsd   %xmm0, 24(%rsp)
          fsubl   24(%rsp)
          flds    .LCPI1_0(%rip)
  -       fmul    %st(0), %st(1)
          fnstcw  4(%rsp)
  +       fmul    %st(0), %st(1)
          movzwl  4(%rsp), %eax
          movw    $3199, 4(%rsp)          # imm = 0xC7F
          fldcw   4(%rsp)

Repository:
  rL LLVM

https://reviews.llvm.org/D36388