On Tuesday 16 December 2008 01:03:36 Evan Cheng wrote:
> FYI. http://leonardo-m.livejournal.com/73732.html
> If anyone is motivated, please file bugs for the losing cases. Also,
> it might make sense to incorporate the tests into our nightly tester
> test suite.

FWIW, I just ported my ray tracer benchmark to C and found that, on an Opteron,
llvm-gcc gives much worse performance than gcc on x86 but not on x86-64:

2.1GHz Opteron

     gcc 4.3.2: 5.60s (gcc -Wall -O3 -lm ray.c -o ray)
llvm-gcc 4.2.1: 9.00s (llvm-gcc -O3 -march=opteron -msse2 -lm ray.c -o ray)

     gcc 4.3.2: 4.18s (gcc -Wall -O3 -lm ray.c -o ray)
llvm-gcc 4.2.1: 5.00s (llvm-gcc -O3 -march=opteron -msse2 -lm ray.c -o ray)

Note that the LLVM-generated code is 60% slower than GCC's in the first (x86) case.

I am unfamiliar with x86 assembler but I believe the problem is that LLVM is 
calling a function for fsqrt rather than using the x86 op-code. Should I be 
passing some command line arguments or using a newer llvm-gcc to get it to 
emit fsqrt or is that not yet implemented?

Benchmark was:

  time ./ray 9 512 >image.pgm

Compile times go down from 0.36s to 0.13s on x86 and 0.35s to 0.19s on x86-64 
as expected.

Here's the code:

#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Floating-point type used throughout the ray tracer, and its machine epsilon.
 * Swap in the commented-out pair below to build with double precision. */
#define real float
#define epsilon FLT_EPSILON
//#define real double
//#define epsilon DBL_EPSILON

/* C99 and later <math.h> define INFINITY as a macro, which turns the
 * declaration below (and the assignment to it in main) into a syntax error.
 * Drop the macro so INFINITY can name this file's own variable. */
#ifdef INFINITY
#undef INFINITY
#endif

/* delta:    offset applied along the surface normal in ray_trace before its
 *           second intersect() call; main sets it to sqrt(epsilon).
 * INFINITY: "no intersection" sentinel; main sets it to 1.0 / 0.0. */
real delta, INFINITY;

/* Three-component vector of `real` coordinates. */
typedef struct {
  real x;
  real y;
  real z;
} Vec;
/* Build a Vec from its three components. */
Vec vec(real x, real y, real z) {
  Vec out;
  out.x = x;
  out.y = y;
  out.z = z;
  return out;
}
/* Component-wise sum a + b. */
Vec add(const Vec a, const Vec b) {
  Vec sum;
  sum.x = a.x + b.x;
  sum.y = a.y + b.y;
  sum.z = a.z + b.z;
  return sum;
}
/* Component-wise difference a - b. */
Vec sub(const Vec a, const Vec b) {
  Vec diff;
  diff.x = a.x - b.x;
  diff.y = a.y - b.y;
  diff.z = a.z - b.z;
  return diff;
}
/* Multiply every component of b by the scalar a. */
Vec scale(real a, const Vec b) {
  Vec scaled;
  scaled.x = a * b.x;
  scaled.y = a * b.y;
  scaled.z = a * b.z;
  return scaled;
}
/* Dot product of a and b. Kept as a single expression so the floating-point
 * evaluation is the same as writing the sum of products inline. */
real dot(const Vec a, const Vec b) {
  return (a.x * b.x)
       + (a.y * b.y)
       + (a.z * b.z);
}
Vec unitise(const Vec a) { return scale((1.0 / sqrt(dot(a, a))), a); }

/* Node of the sphere hierarchy built by create().
 * center/radius describe this node's sphere. child is NULL for a leaf;
 * otherwise it points to an array of 5 child nodes, and intersect() only
 * descends into them when the ray hits this node's sphere. */
struct Scene {
  Vec center;
  real radius;
  struct Scene *child;
};

/*
 * Intersect the ray o + t*d with the sphere of center c and radius r.
 *
 * The discriminant formula used here assumes d has unit length; the callers
 * in this file pass directions produced by unitise().
 *
 * Returns the parameter t of the intersection: the nearer root when it is
 * positive, otherwise the farther root. Returns INFINITY (this file's
 * sentinel variable) when the discriminant is negative or the farther root
 * is negative.
 */
real ray_sphere(Vec o, Vec d, Vec c, real r) {
  Vec v = sub(c, o);
  real b = dot(v, d), disc = b*b - dot(v, v) + r*r, t1, t2;

  if (disc < 0.0) {
    return INFINITY;
  }
  disc = sqrt(disc);

  t2 = b + disc;  /* farther root */
  if (t2 < 0.0) {
    return INFINITY;
  }

  t1 = b - disc;  /* nearer root */
  return (t1 > 0.0 ? t1 : t2);
}

/*
 * Find the nearest leaf sphere hit by the ray (o, d) within `scene`.
 *
 * lambda: in/out. On entry the best distance found so far; lowered to the
 *         hit distance whenever a closer leaf is found.
 * t:      out. Set to the leaf node that produced the new *lambda; left
 *         untouched when nothing closer is hit.
 *
 * A node with children acts as a bounding sphere: its 5 children are only
 * visited when the ray reaches the node's own sphere closer than *lambda.
 */
void intersect(Vec o, Vec d, real *lambda, struct Scene **t,
               struct Scene *scene) {
  real lambda2 = ray_sphere(o, d, scene->center, scene->radius);
  if (lambda2 < *lambda) {
    if (scene->child) {
      int i;
      for (i = 0; i < 5; ++i) {
        intersect(o, d, lambda, t, &scene->child[i]);
      }
    } else {
      *lambda = lambda2;
      *t = scene;
    }
  }
}

/* Unit vector shared by main and ray_trace: main sets it to
 * unitise(vec(1, 3, -2)); ray_trace dots it with the surface normal and uses
 * it as the direction of its second intersect() call.
 * NOTE(review): presumably the direction from a surface toward the light
 * (i.e. the negated light direction) -- confirm. */
Vec neglight;

/*
 * Trace the ray (o, d) through `scene` and return its brightness.
 *
 * Returns 0.0 when the ray hits nothing, when the hit surface faces away
 * from neglight (dot(normal, neglight) <= 0), or when a second ray cast from
 * the hit point along neglight hits something. Otherwise returns
 * dot(normal, neglight).
 *
 * `scene` is passed by value; the copy shares the same child array as the
 * caller's struct.
 */
real ray_trace(Vec o, Vec d, struct Scene scene) {
  real lambda = INFINITY;
  struct Scene *t = NULL;

  intersect(o, d, &lambda, &t, &scene);
  if (lambda == INFINITY) {
    return 0.0;
  }

  {
    Vec p = add(o, scale(lambda, d));
    Vec normal = unitise(sub(p, t->center));
    real g = dot(normal, neglight);
    if (g <= 0.0) {
      return 0.0;
    }

    /* Nudge the start point off the surface by delta along the normal so
     * the second ray does not immediately re-hit the sphere it starts on. */
    p = add(p, scale(delta, normal));
    lambda = INFINITY;
    intersect(p, neglight, &lambda, &t, &scene);
    return (lambda < INFINITY ? 0.0 : g);
  }
}

/*
 * Recursively build the sphere hierarchy.
 *
 * level: recursion depth. Any level <= 1 yields a single leaf sphere of
 *        center c and radius r (values below 1 are treated as 1 so the
 *        recursion always terminates).
 * c, r:  center and radius of the sphere at this level.
 *
 * For level > 1 the returned node has radius 3*r and owns a heap-allocated
 * array of 5 children: one leaf sphere (c, r) plus four sub-hierarchies of
 * radius r/2 offset from c by rn * (+-1, 1, +-1). The array is never freed by
 * this file. Exits the process with EXIT_FAILURE if the allocation fails.
 */
struct Scene create(int level, Vec c, real r) {
  struct Scene scene;
  scene.center = c;
  if (level <= 1) {
    scene.radius = r;
    scene.child = NULL;
  } else {
    real rn = 3*r/sqrt(12);
    scene.radius = 3*r;
    scene.child = malloc(5 * sizeof *scene.child);
    if (scene.child == NULL) {
      fprintf(stderr, "create: out of memory\n");
      exit(EXIT_FAILURE);
    }
    scene.child[0] = create(1, c, r);
    scene.child[1] = create(level-1, add(c, scale(rn, vec(-1, 1, -1))), r/2);
    scene.child[2] = create(level-1, add(c, scale(rn, vec( 1, 1, -1))), r/2);
    scene.child[3] = create(level-1, add(c, scale(rn, vec(-1, 1,  1))), r/2);
    scene.child[4] = create(level-1, add(c, scale(rn, vec( 1, 1,  1))), r/2);
  }
  return scene;
}

/*
 * Render the scene as a binary PGM ("P5") image on stdout.
 *
 * Usage: ray [level n]
 *   level: hierarchy depth passed to create() (default 9)
 *   n:     image width and height in pixels (default 512)
 * Both arguments must be given together; otherwise the defaults are used.
 * Each pixel averages ss*ss sub-pixel rays cast from (0, 0, -4).
 */
int main(int argc, char *argv[]) {
  struct Scene scene;
  int level, n, ss=4, x, y;
  level = (argc==3 ? atoi(argv[1]) : 9);
  n = (argc==3 ? atoi(argv[2]) : 512);
  delta = sqrt(epsilon);
  INFINITY = 1.0 / 0.0;
  neglight = unitise(vec(1, 3, -2));
  scene = create(level, vec(0, -1, 0), 1);
  printf("P5\n%d %d\n255\n", n, n);
  for (y=n-1; y>=0; --y) {
    for (x=0; x<n; ++x) {
      real g=0.0;
      int dx, dy;
      for (dx=0; dx<ss; ++dx) {
        for (dy=0; dy<ss; ++dy) {
          Vec d=unitise(vec(x+dx*1./ss-n/2., y+dy*1./ss-n/2., n));
          g += ray_trace(vec(0, 0, -4), d, scene);
        }
      }
      /* Convert via unsigned char: the value can reach 255, which is out of
       * range for a signed 8-bit char and would make the conversion
       * undefined. %c writes the same byte either way. */
      printf("%c", (unsigned char)(0.5 + 255.0 * g / (ss*ss)));
    }
  }
  return 0;
}

