[cfe-dev] [test-suite] making polybench/symm succeed with "-Ofast" and "-ffp-contract=on"

Fri Oct 14 07:50:36 PDT 2016

On Wed, Oct 12, 2016 at 11:29 AM, Sebastian Pop <sebpop.llvm at gmail.com> wrote:
> On Wed, Oct 12, 2016 at 10:53 AM, Hal Finkel <hfinkel at anl.gov> wrote:
>> I don't think that Clang/LLVM uses it by default on x86_64. If you're using -Ofast, however, that would explain it. I recommend looking at -O3 vs -O0 and make sure those are the same. -Ofast enables -ffast-math, which can legitimately cause differences.
>>
>
> The following tests pass at "-O3" and "-O3 -ffp-contract=on" compared
> with FP_ABSTOLERANCE=1e-5 against "-O0 -ffp-contract=off":
>
> polybench/linear-algebra/kernels/symm
> polybench/linear-algebra/solvers/gramschmidt
> polybench/stencils/seidel-2d
>

These 3 tests are passing with the following configurations:
-O3 -ffp-contract=off
-O3 -ffp-contract=on
-O0 -ffp-contract=off
-O0 -ffp-contract=on

They are not passing at:
-Ofast -ffp-contract=on
-Ofast -ffp-contract=off

Using Abe's CMake/Makefile variables to detect the use of -ffast-math,
we could change FP_ABSTOLERANCE at -Ofast, with something like this:

# Select the absolute tolerance used when comparing test output, depending on
# whether the build uses fast-math. TEST_SUITE_USES_FAST_MATH is presumably the
# variable provided by the CMake/Makefile detection mentioned above -- TODO
# confirm the exact name.
if(TEST_SUITE_USES_FAST_MATH)
  # Looser tolerance for fast-math builds. NOTE(review): 1e0 is smaller than
  # the 1e1 measured for polybench symm at -Ofast in the list that follows.
  add_definitions(-DFP_ABSTOLERANCE=1e0)
else()
  # Tolerance the three tests were reported to pass with at -O3.
  add_definitions(-DFP_ABSTOLERANCE=1e-5)
endif()

The tests are passing at -Ofast with the following tolerances:

polybench/linear-algebra/kernels/symm, FP_ABSTOLERANCE=1e1
polybench/linear-algebra/solvers/gramschmidt, FP_ABSTOLERANCE=1e0
polybench/stencils/seidel-2d, FP_ABSTOLERANCE=1e-5

Even with these FP_ABSTOLERANCE values, the 3 tests currently still fail
at -Ofast because the output of array_StrictFP does not match the hash.
The cause may be a bug in the handling of -ffast-math combined with
__attribute__((optnone)):
$ clang -O3 -ffast-math f.c -S -o ofast.s
$ clang -O3 f.c -S -o o3.s
$ diff -u o3.s ofast.s

--- o3.s        2016-10-14 10:39:46.411567948 -0400
+++ ofast.s     2016-10-14 10:39:45.079567919 -0400
@@ -109,16 +109,16 @@
        addq    %rax, %rcx
        movslq  -64(%rsp), %rax
        mulsd   (%rcx,%rax,8), %xmm1
-       addsd   %xmm0, %xmm1
-       movsd   -24(%rsp), %xmm0        # xmm0 = mem[0],zero
-       mulsd   -56(%rsp), %xmm0
-       addsd   %xmm1, %xmm0
+       movsd   -24(%rsp), %xmm2        # xmm2 = mem[0],zero
+       mulsd   -56(%rsp), %xmm2
+       addsd   %xmm0, %xmm2
+       addsd   %xmm1, %xmm2
        movq    -32(%rsp), %rax
        movslq  -68(%rsp), %rcx
        shlq    $13, %rcx
        addq    %rax, %rcx
        movslq  -64(%rsp), %rax
-       movsd   %xmm0, (%rcx,%rax,8)
+       movsd   %xmm2, (%rcx,%rax,8)
 # BB#9:                                 # %for.inc50
                                         #   in Loop: Header=BB0_3 Depth=2
        movl    -64(%rsp), %eax

$ cat f.c
/*
 * Strict floating-point variant of the polybench symm kernel.
 *
 * Updates C in place from A and B, scaled by alpha and beta, for
 * i in [0, ni) and j in [0, nj).
 *
 * ni, nj  - loop bounds for i and j; the arrays are declared 1024 x 1024,
 *           and the code itself performs no bounds checking.
 * alpha   - scalar multiplying the A*B terms and the accumulated sum.
 * beta    - scalar multiplying the previous value of C[i][j].
 * C       - read and written.
 * A, B    - only read.
 *
 * optnone asks Clang not to optimize this function, so that its output can
 * serve as the strict reference result.
 */
__attribute__((optnone))
void kernel_symm_StrictFP(int ni, int nj,
                          double alpha,
                          double beta,
                          double C[1024 + 0][1024 + 0],
                          double A[1024 + 0][1024 + 0],
                          double B[1024 + 0][1024 + 0])
{
/* Forbid contracting a multiply and an add into a fused operation here. */
#pragma STDC FP_CONTRACT OFF
  int i, j, k;
  double acc;
  for (i = 0; i < ni; i++)
    for (j = 0; j < nj; j++)
      {
 /* acc is restarted for every (i, j) pair. */
 acc = 0;
 /* k runs over 0 .. j-2; there are no iterations when j <= 1. */
 for (k = 0; k < j - 1; k++)
   {
     /* Row k of column j is updated on every pass, for each i. */
     C[k][j] += alpha * A[k][i] * B[i][j];
     acc += B[k][j] * A[k][i];
   }
 /*
  * Parsed left to right as
  *   (beta * C[i][j] + (alpha * A[i][i]) * B[i][j]) + alpha * acc.
  * A strict build has to keep this association of the three-term sum.
  */
 C[i][j] = beta * C[i][j] + alpha * A[i][i] * B[i][j] + alpha * acc;
      }
}