[llvm-commits] CVS: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c alti.isamax.c alti.sdot.c alti.stepfft.c

Mon Apr 3 17:48:09 PDT 2006

Changes in directory llvm-test/SingleSource/UnitTests/Vector/Altivec:

alti.expandfft.c added (r1.1)
alti.isamax.c added (r1.1)
alti.sdot.c added (r1.1)
alti.stepfft.c added (r1.1)
---
Log message:

Added some Altivec and SSE examples from:

Introduction to Parallel Computing 
A practical guide with examples in C 

Oxford Texts in Applied and Engineering Mathematics No. 9 
Oxford University Press, February 2004 
ISBN: 0-19-851576-6 (hardback), 0-19-851577-4 (paperback) 

http://people.inf.ethz.ch/arbenz/book/

---
Diffs of the changes:  (+725 -0)

 alti.expandfft.c |  287 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 alti.isamax.c    |  131 +++++++++++++++++++++++++
 alti.sdot.c      |  103 +++++++++++++++++++
 alti.stepfft.c   |  204 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 725 insertions(+)

Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c:1.1
*** /dev/null	Mon Apr  3 19:48:04 2006
--- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c	Mon Apr  3 19:47:54 2006
***************
*** 0 ****
--- 1,287 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <time.h>
+ #include <float.h>
+ #define N 1048576
+ #define N2 N/2
+ main()
+ {
+ /* 
+    Example of Apple Altivec coded binary radix FFT
+    using intrinsics from Petersen and Arbenz "Intro. 
+    to Parallel Computing," Section 3.6
+  
+    This is an expanded version of a generic work-space 
+    FFT: steps are in-line. cfft2(n,x,y,w,sign) takes complex
+    n-array "x" (Fortran real,aimag,real,aimag,... order) 
+    and writes its DFT in "y". Both input "x" and the 
+    original contents of "y" are destroyed. Initialization
+    for array "w" (size n/2 complex of twiddle factors
+    (exp(twopi*i*k/n), for k=0..n/2-1)) is computed once
+    by cffti(n,w).
+ 
+                       WPP, SAM. Math. ETHZ, 1 June, 2002 
+ */
+ 
+    int first,i,icase,it,ln2,n;
+    int nits=1000000;
+    static float seed = 331.0;
+    float error,fnm1,sign,z0,z1,ggl();
+    float *x,*y,*z,*w;
+    double t1,mflops;
+    void cffti(),cfft2();
+ /* allocate storage for x,y,z,w on 4-word bndr. */
+    x = (float *) malloc(8*N);
+    y = (float *) malloc(8*N);
+    z = (float *) malloc(8*N);
+    w = (float *) malloc(4*N);
+    n     = 2;
+    for(ln2=1;ln2<21;ln2++){
+       first = 1;
+       for(icase=0;icase<2;icase++){
+          if(first){
+             for(i=0;i<2*n;i+=2){
+                z0 = ggl(&seed);     /* real part of array */
+                z1 = ggl(&seed);     /* imaginary part of array */
+                x[i] = z0;
+                z[i] = z0;           /* copy of initial real data */
+                x[i+1] = z1;
+                z[i+1] = z1;         /* copy of initial imag. data */
+             }
+          } else {
+             for(i=0;i<2*n;i+=2){
+                z0 = 0;              /* real part of array */
+                z1 = 0;              /* imaginary part of array */
+                x[i] = z0;
+                z[i] = z0;           /* copy of initial real data */
+                x[i+1] = z1;
+                z[i+1] = z1;         /* copy of initial imag. data */
+             }
+          }
+ /* initialize sine/cosine tables */
+          cffti(n,w);
+ /* transform forward, back */
+          if(first){
+             sign = 1.0;
+             cfft2(n,x,y,w,sign);
+             sign = -1.0;
+             cfft2(n,y,x,w,sign);
+ /* results should be same as initial multiplied by n */
+             fnm1 = 1.0/((float) n);
+             error = 0.0;
+             for(i=0;i<2*n;i+=2){
+                error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
+                    (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
+             }
+             error = sqrt(fnm1*error);
+             printf(" for n=%d, fwd/bck error=%e\n",n,error);
+             first = 0;
+          } else {
+             t1   = ((double)clock())/((double) CLOCKS_PER_SEC);
+             for(it=0;it<nits;it++){
+                sign = +1.0;
+                cfft2(n,x,y,w,sign);
+                sign = -1.0;
+                cfft2(n,y,x,w,sign);
+             }
+             t1   = ((double)clock())/((double) CLOCKS_PER_SEC) - t1;
+             t1   = 0.5*t1/((double) nits);
+             mflops = 5.0*((double) n)*((double) ln2)/((1.e+6)*t1);
+             printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops);
+          }
+       }
+       if((ln2%4)==0) nits /= 10;
+       n *= 2;
+    }
+ }
+ void cfft2(unsigned int n,float x[][2],float y[][2],float w[][2], float sign)
+ {
+ 
+ /* 
+    altivec version of cfft2 from Petersen and Arbenz book, "Intro.
+    to Parallel Computing", Oxford Univ. Press, 2003, Section 3.6
+                                         wpp 14. Dec. 2003
+ */
+ 
+    int jb,jc,jd,jw,k,k2,k4,lj,m,j,mj,mj2,pass,tgle;
+    float rp,up,wr[4],wu[4];
+    float *a,*b,*c,*d;
+    const vector float vminus = (vector float)(-0.,0.,-0.,0.);
+    const vector float vzero  = (vector float)(0.,0.,0.,0.);
+    const vector unsigned char pv3201 =
+    (vector unsigned char)(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11);
+    vector float V0,V1,V2,V3,V4,V5,V6,V7;
+    vector float V8,V9,V10,V11,V12,V13,V14,V15;
+ 
+    if(n<=1){
+       y[0][0] = x[0][0];
+       y[0][1] = x[0][1];
+       return;
+    }
+    m    = (int) (log((float) n)/log(1.99));
+    mj   = 1;
+    mj2  = 2;
+    lj   = n/2;
+ /* first pass thru data: x -> y */
+    for(j=0;j<lj;j++){
+       jb = n/2+j; jc  = j*mj2; jd = jc + 1;
+       rp = w[j][0]; up = w[j][1];
+       if(sign<0.0) up = -up;
+       y[jd][0] = rp*(x[j][0] - x[jb][0]) - up*(x[j][1] - x[jb][1]);
+       y[jd][1] = up*(x[j][0] - x[jb][0]) + rp*(x[j][1] - x[jb][1]);
+       y[jc][0] = x[j][0] + x[jb][0];
+       y[jc][1] = x[j][1] + x[jb][1];
+    }
+    if(n==2) return;
+ /* next pass is mj = 2 */
+    mj  = 2;
+    mj2 = 4;
+    lj  = n/4;
+    a = (float *)&y[0][0];
+    b = (float *)&y[n/2][0];
+    c = (float *)&x[0][0];
+    d = (float *)&x[mj][0];
+    if(n==4){
+       c = (float *)&y[0][0];
+       d = (float *)&y[mj][0];
+    }
+    for(j=0;j<lj;j++){
+       jw = j*mj; jc = j*mj2; jd = 2*jc;
+       rp = w[jw][0]; up = w[jw][1];
+       if(sign<0.0) up = -up;
+       wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
+       wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up;
+       V6 = vec_ld(0,wr);
+       V7 = vec_ld(0,wu);
+       V7 = vec_xor(V7,vminus);
+       V0 = vec_ld(0,(vector float *) (a+jc));
+       V1 = vec_ld(0,(vector float *) (b+jc));
+       V2 = vec_add(V0,V1);                         /* a + b */
+       vec_st(V2,0,(vector float *) (c+jd));     /* store c */
+       V3 = vec_sub(V0,V1);                         /* a - b */
+       V4 = vec_perm(V3,V3,pv3201);
+       V0 = vec_madd(V6,V3,vzero);
+       V1 = vec_madd(V7,V4,vzero);
+       V2 = vec_add(V0,V1);                         /* w*(a - b) */
+       vec_st(V2,0,(vector float*) (d+jd));         /* store d */
+    }
+    if(n==4) return;
+    mj  *= 2;
+    mj2  = 2*mj;
+    lj   = n/mj2; 
+    tgle = 0;  
+    for(pass=2;pass<m-1;pass++){
+       if(tgle){
+          a = (float *)&y[0][0];
+          b = (float *)&y[n/2][0];
+          c = (float *)&x[0][0];
+          d = (float *)&x[mj][0];
+  	 tgle = 0;
+       } else {
+          a = (float *)&x[0][0];
+          b = (float *)&x[n/2][0];
+          c = (float *)&y[0][0];
+          d = (float *)&y[mj][0];
+ 	 tgle = 1;
+       }
+       for(j=0; j<lj; j++){
+          jw = j*mj; jc = j*mj2; jd = 2*jc;
+          rp = w[jw][0];
+          up = w[jw][1];
+          if(sign<0.0) up = -up;
+          wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
+          wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up;
+          V6 = vec_ld(0,wr);
+          V7 = vec_ld(0,wu);
+          V7 = vec_xor(V7,vminus);
+          for(k=0; k<mj; k+=4){
+             k2   = 2*k; k4 = k2+4;
+             V0 = vec_ld(0,(vector float *) (a+jc+k2));
+             V1 = vec_ld(0,(vector float *) (b+jc+k2));
+             V2 = vec_add(V0,V1);                        /* a + b */
+             vec_st(V2,0,(vector float*) (c+jd+k2));   /* store c */
+             V3 = vec_sub(V0,V1);                        /* a - b */
+             V4 = vec_perm(V3,V3,pv3201);
+             V0 = vec_madd(V6,V3,vzero);
+             V1 = vec_madd(V7,V4,vzero);
+             V2 = vec_add(V0,V1);                        /* w*(a - b) */
+             vec_st(V2,0,(vector float *) (d+jd+k2));    /* store d */
+ 	    V8 = vec_ld(0,(vector float *) (a+jc+k4));
+ 	    V9 = vec_ld(0,(vector float *) (b+jc+k4));
+ 	    V10 = vec_add(V8,V9);		        /* a + b */
+ 	    vec_st(V10,0,(vector float *) (c+jd+k4));   /* store c */
+ 	    V11 = vec_sub(V8,V9);		        /* a - b */
+ 	    V12 = vec_perm(V11,V11,pv3201);
+ 	    V8  = vec_madd(V6,V11,vzero);
+ 	    V9  = vec_madd(V7,V12,vzero);
+ 	    V10 = vec_add(V8,V9);		        /* w*(a - b) */
+ 	    vec_st(V10,0,(vector float *) (d+jd+k4));   /* store d */
+          }
+       }
+       mj  *= 2;
+       mj2  = 2*mj;
+       lj   = n/mj2;
+    }
+ /* last pass thru data: in-place if previous in y */
+    c = (float *)&y[0][0];
+    d = (float *)&y[n/2][0];
+    if(tgle) {
+       a = (float *)&y[0][0];
+       b = (float *)&y[n/2][0];
+    } else {
+       a = (float *)&x[0][0];
+       b = (float *)&x[n/2][0];
+    }
+    for(k=0; k<(n/2); k+=4){
+       k2 = 2*k; k4 = k2+4;
+       V0 = vec_ld(0,(vector float *) (a+k2));
+       V1 = vec_ld(0,(vector float *) (b+k2));
+       V2 = vec_add(V0,V1);                      /* a + b */
+       vec_st(V2,0,(vector float*) (c+k2));      /* store c */
+       V3 = vec_sub(V0,V1);                      /* a - b */
+       vec_st(V3,0,(vector float *) (d+k2));     /* store d */
+       V4 = vec_ld(0,(vector float *) (a+k4));
+       V5 = vec_ld(0,(vector float *) (b+k4));
+       V6 = vec_add(V4,V5);                      /* a + b */
+       vec_st(V6,0,(vector float *) (c+k4));     /* store c */
+       V7 = vec_sub(V4,V5);                      /* a - b */
+       vec_st(V7,0,(vector float *) (d+k4));     /* store d */
+    }
+ }
+ void cffti(int n, float w[][2])
+ {
+ 
+ /* initialization routine for cfft2: computes
+         cos(twopi*k),sin(twopi*k) for k=0..n/2-1
+    - the "twiddle factors" for a binary radix FFT */
+ 
+    int i,n2;
+    float aw,arg,pi;
+    pi = 3.141592653589793;
+    n2 = n/2;
+    aw = 2.0*pi/((float)n);
+    for(i=0;i<n2;i++){
+       arg   = aw*((float)i);
+       w[i][0] = cos(arg);
+       w[i][1] = sin(arg);
+    }
+ }
+ #include <math.h>
+ float ggl(float *ds)
+ {
+ 
+ /* generate u(0,1) distributed random numbers. 
+    Seed ds must be saved between calls. ggl is 
+    essentially the same as the IMSL routine RNUM. 
+ 
+    W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: 
+    a modification of a fortran version from 
+    I. Vattulainen, Tampere Univ. of Technology, 
+    Finland, 1992 */
+ 
+    double t,d2=0.2147483647e10;
+    t   = (float) *ds;
+    t   = fmod(0.16807e5*t,d2);
+    *ds = (float) t;
+    return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }

Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c:1.1
*** /dev/null	Mon Apr  3 19:48:09 2006
--- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c	Mon Apr  3 19:47:54 2006
***************
*** 0 ****
--- 1,131 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <float.h>
+ #define N 1027
+ main()
+ {
+ /* 
+   Mac G-4 unit step isamax for arbitrary N code 
+   This is an Altivec version of isamax0 from Section 3.5.7
+   in Arbenz and Petersen, "Intro. to Parallel Computing"
+   Oxford Univ. Press, 2004
+                                            wpp 5/8/2002 
+ */
+   float x[N];
+   float xb;
+   int err,flag,i,im,k,ki,kl,ib,n0,n;
+   int isamax(int,float *);
+   static float seed = 331.0;
+   float ggl(float*);
+   flag = 0;                         // error flag
+   kl   = 3;
+   n0   = 1;
+   for(k=0;k<5;k++){
+      for(ki=0;ki<kl;ki++){
+         n  = n0 + ki;
+         ib = 0; xb = 0.0;
+         for(i=0;i<n;i++){
+            x[i] = ggl(&seed);
+            if(fabs(x[i]) > xb){
+               xb = fabs(x[i]);
+               ib = i;
+            }
+         }
+         im  = isamax(n,x);
+         err = ib - im;
+         if(err != 0){
+             printf(" err in isamax: n = %d, ib = %d, im = %d\n",n,ib,im);
+             flag = 1;
+         }
+      }
+      n0  *= 4;                    // increase n
+      kl   = 4;                    // for n > 1, 3 steps of increase in n
+   }
+   if(flag==0) printf(" All n tests pass\n");
+ }
+ #define NS 12
+ int isamax(int n, float *x)
+ {
+   float rbig,*xp;
+   int i,ii,nres,nsegs,ibig,irbig;
+   vector float V0,V1,V6;
+   vector bool int V3;
+   vector float V2 = (vector float) (0.0,1.0,2.0,3.0);
+   vector float V7 = (vector float) (0.0,1.0,2.0,3.0);
+   const vector float incr_4 = (vector float) (4.0,4.0,4.0,4.0);
+   const vector float minus0 = (vector float) (-0.0,-0.0,-0.0,-0.0);
+   float big,xbig[4],indx[4];
+ // n < NS done in scalar mode
+   if(n < NS){
+      ibig = 0;
+      rbig = 0.0;
+      for(i=0;i<n;i++){
+         if(fabs(x[i]) > rbig){
+            rbig = fabs(x[i]);
+            ibig = i;
+         }
+      }
+      return(ibig);
+   }
+ // n >= NS case done with altivec 
+   nsegs = (n >> 2) - 1;
+   nres  = n - ((nsegs+1) << 2);    // nres = n mod 4
+   V2 = vec_add(V2,incr_4);         // increment next index
+   xp = x;
+   V0 = vec_ld(0,xp); xp += 4;      // first four 
+   V1 = vec_ld(0,xp); xp += 4;      // next four
+   V0 = vec_abs(V0);                // absolute value of first four
+   for(i=0;i<nsegs;i++){
+      V1 = vec_abs(V1);             // take absolute value fo next segment
+      V3 = vec_cmpgt(V1,V0);        // compare accumulated 4 to next 4
+      V0 = vec_sel(V0,V1,V3);       // select new or old accumulation
+      V7 = vec_sel(V7,V2,V3);       // select new index or old
+      V1 = vec_ld(0,xp); xp += 4;   // bottom load next 4
+      V2 = vec_add(V2,incr_4);
+   }
+   V1 = vec_ld(0,xp); xp += 4;  // bottom load next four
+   V3 = vec_cmpgt(V1,V0);      // compare accumulated to last 4
+   V0 = vec_sel(V0,V1,V3);     // select accumulation to last 4
+   V7 = vec_sel(V7,V2,V3);     // select index of accum. to last 4
+ // Now finish up: segment maxima are in V0, indices in V7
+   vec_st(V0,0,xbig);
+   vec_st(V7,0,indx);
+   ii   = ((n >> 2) << 2);
+   big  = 0.0;
+   ibig = 0.0;
+   for(i=0;i<nres;i++){
+      if(fabs(x[ii]) > big){
+         big  = fabs(x[ii]);
+         ibig = ii;
+      }
+      ii++;
+   }
+   for(i=0;i<4;i++){
+      if(xbig[i] > big){
+         big  = xbig[i];
+         ibig = (int) indx[i];
+      }
+   }
+   return(ibig);
+ }
+ #undef NS
+ #include <math.h>
+ float ggl(float *ds)
+ {
+  
+ /* generate u(0,1) distributed random numbers.
+    Seed ds must be saved between calls. ggl is
+    essentially the same as the IMSL routine RNUM.
+  
+    W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+    a modification of a fortran version from
+    I. Vattulainen, Tampere Univ. of Technology,
+    Finland, 1992 */
+  
+    double t,d2=0.2147483647e10;
+    t   = (float) *ds;
+    t   = fmod(0.16807e5*t,d2);
+    *ds = (float) t;
+    return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
+ 

Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c:1.1
*** /dev/null	Mon Apr  3 19:48:09 2006
--- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c	Mon Apr  3 19:47:54 2006
***************
*** 0 ****
--- 1,103 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <float.h>
+ #define N 1027
+ main()
+ {
+ /* Mac G-4 sdot for arbitrary N wpp 6/8/2002 */
+   float x[N],y[N],tres,res,eps;
+   int flag,i,k,ki,kl,n0,n;
+   static float seed = 331.0;
+   float sdot(int,float *,float *);
+   float ggl(float *);
+   eps  = FLT_EPSILON;    /* machine eps */
+   n0   = 1; kl   = 3;
+   flag = 0;
+   for(k=0;k<5;k++){
+      for(ki=0;ki<kl;ki++){
+         n    = n0 + ki;
+         tres = 0.0;
+         for(i=0;i<n;i++){
+            x[i] = ggl(&seed);
+            y[i] = ggl(&seed);
+            tres = tres + x[i]*y[i];
+         }
+         res = sdot(n,x,y);
+         if(fabs(tres-res) > ((float) n)*eps){
+            flag = 1;
+            printf(" n = %d, test sdot value = %e, sdot value = %e\n",
+                     n,tres,res);
+         }
+      }
+      n0 *= 4;
+      kl  = 4;
+   }
+   if(flag == 0) printf(" All n tests passed\n");
+ }
+ #define NS 12
+ float sdot(int n, float *x, float *y)
+ {
+   float sum,*xp,*yp;
+   int i,ii,nres,nsegs;
+   vector float V7 = (vector float)(0.0,0.0,0.0,0.0);
+   vector float V0,V1;
+   float psum[4];
+ // n < NS done in scalar mode
+   if(n < NS){
+      sum = x[0]*y[0];
+      for(i=1;i<n;i++){
+         sum += x[i]*y[i];
+      }
+      return(sum);
+   }
+ // n >= NS case done with altivec 
+   xp = x;
+   yp = y;
+   V0    = vec_ld(0,xp); xp += 4;   // increment next index
+   V1    = vec_ld(0,yp); yp += 4;   // increment next index
+   nsegs = (n >> 2) - 1;
+   nres  = n - ((nsegs+1) << 2);    // nres = n mod 4
+   for(i=0;i<nsegs;i++){
+      V7 = vec_madd(V0,V1,V7);       // partial sum gp. of 4
+      V0 = vec_ld(0,xp); xp += 4;    // bottom load next 4 x
+      V1 = vec_ld(0,yp); yp += 4;    // bottom load next 4 y
+   }
+   V7 = vec_madd(V0,V1,V7);          // final partial sum
+ // Now finish up: segment maxima are in V0, indices in V7
+   vec_st(V7,0,psum);
+   if(nres > 0){
+      ii   = ((n >> 2) << 2);
+      sum  = x[ii]*y[ii];
+      ii++;
+      for(i=1;i<nres;i++){
+         sum += x[ii]*y[ii];
+         ii++;
+      }
+   } else {
+      sum = 0.0;
+   }
+   for(i=0;i<4;i++){
+      sum += psum[i];
+   }
+   return(sum);
+ }
+ #undef NS
+ #include <math.h>
+ float ggl(float *ds)
+ {
+ 
+ /* generate u(0,1) distributed random numbers. 
+    Seed ds must be saved between calls. ggl is 
+    essentially the same as the IMSL routine RNUM. 
+ 
+    W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: 
+    a modification of a fortran version from 
+    I. Vattulainen, Tampere Univ. of Technology, 
+    Finland, 1992 */
+ 
+    double t,d2=0.2147483647e10;
+    t   = (float) *ds;
+    t   = fmod(0.16807e5*t,d2);
+    *ds = (float) t;
+    return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }

Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c:1.1
*** /dev/null	Mon Apr  3 19:48:09 2006
--- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c	Mon Apr  3 19:47:54 2006
***************
*** 0 ****
--- 1,204 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <time.h>
+ #include <float.h>
+ #define N 128
+ #define N2 N/2
+ main()
+ {
+ /* SSE version of cfft2 - uses Apple intrinsics
+    W. Petersen, SAM. Math. ETHZ 2 May, 2002 */
+    int first,i,icase,it,n;
+    int nits=1000;              /* number of iterations for timing test */
+    float error,fnm1,sign,z0,z1,ggl();
+    static float seed = 331.0;
+    float *x,*y,*z,*w;
+    float t1,ln2,mflops;
+    void cffti(),cfft2();
+ /* allocate storage for x,y,z,w on 4-word bndr. */
+    x = (float *)malloc(8*N);
+    y = (float *)malloc(8*N);
+    z = (float *)malloc(8*N);
+    w = (float *)malloc(4*N);
+    first = 1;
+    for(icase=0;icase<2;icase++){
+    if(first){
+       for(i=0;i<2*N;i+=2){
+          z0 = ggl(&seed);     /* real part of array */
+          z1 = ggl(&seed);     /* imaginary part of array */
+          x[i] = z0;
+          z[i] = z0;           /* copy of initial real data */
+          x[i+1] = z1;
+          z[i+1] = z1;         /* copy of initial imag. data */
+       }
+    } else {
+       for(i=0;i<2*N;i+=2){
+          z0 = 0;              /* real part of array */
+          z1 = 0;              /* imaginary part of array */
+          x[i] = z0;
+          z[i] = z0;           /* copy of initial real data */
+          x[i+1] = z1;
+          z[i+1] = z1;         /* copy of initial imag. data */
+       }
+    }
+ /* initialize sine/cosine tables */
+    n = N;
+    cffti(n,w);
+ /* transform forward, back */
+    if(first){
+       sign = 1.0;
+       cfft2(n,x,y,w,sign);
+       sign = -1.0;
+       cfft2(n,y,x,w,sign);
+ /* results should be same as initial multiplied by N */
+       fnm1 = 1.0/((float) n);
+       error = 0.0;
+       for(i=0;i<2*N;i+=2){
+          error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
+                   (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
+       }
+       error = sqrt(fnm1*error);
+       printf(" for n=%d, fwd/bck error=%e\n",N,error);
+       first = 0;
+    } else {
+       t1   = ((float)clock())/((float) CLOCKS_PER_SEC);
+       for(it=0;it<nits;it++){
+          sign = +1.0;
+          cfft2(n,x,y,w,sign);
+          sign = -1.0;
+          cfft2(n,y,x,w,sign);
+       }
+       t1   = ((float)clock())/((float) CLOCKS_PER_SEC) - t1;
+       t1   = t1/(2.0*((float) nits));
+       ln2 = 10.0; /* reset this for different N  */
+       mflops = 5.0*((float) N)*ln2/((1.e+6)*t1);
+       printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops);
+    }
+    }
+ }
+ void cfft2(n,x,y,w,sign)
+ int n;
+ float x[][2],y[][2],w[][2],sign;
+ {
+    int jb, m, j, mj, tgle;
+    void ccopy(),step();
+    m    = (int) (log((float) n)/log(1.99));
+    mj   = 1;
+    tgle = 1;  /* toggling switch for work array */
+    step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+    for(j=0;j<m-2;j++){
+       mj *= 2;
+       if(tgle){
+          step(n,mj,&y[0][0],&y[n/2][0],&x[0][0],&x[mj][0],w,sign);
+          tgle = 0;
+       } else {
+          step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+          tgle = 1;
+       }
+    }
+ /* last pass thru data: move y to x if needed */
+    if(tgle) {
+       ccopy(n,y,x);
+    }
+    mj   = n/2;
+    step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+ }
+ void cffti(int n, float w[][2])
+ {
+    int i,n2;
+    float aw,arg,pi;
+    pi = 3.141592653589793;
+    n2 = n/2;
+    aw = 2.0*pi/((float)n);
+    for(i=0;i<n2;i++){
+       arg   = aw*((float)i);
+       w[i][0] = cos(arg);
+       w[i][1] = sin(arg);
+    }
+ }
+ void ccopy(int n, float x[][2], float y[][2])
+ {
+    int i;
+    for(i=0;i<n;i++){
+       y[i][0] = x[i][0];
+       y[i][1] = x[i][1];
+    }
+ }
+ #define cplx __complex__ 
+ #define Re __real__ 
+ #define Im __imag__
+ void step
+ (unsigned int n,unsigned int mj, 
+ cplx float *a, __complex__ float *b, 
+ cplx float *c, __complex__ float *d, 
+ cplx float *w, float sign)
+ {
+    int j,k,jc,jw,l,lj,mj2;
+    float rp,up;
+    float wr[4], wu[4];
+    const vector float vminus = (vector float)(-0.,0.,-0.,0.);
+    const vector float vzero  = (vector float)(0.,0.,0.,0.);
+    const vector unsigned char pv3201 = 
+    (vector unsigned char)(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11);
+    vector float v0,v1,v2,v3,v4,v5,v6,v7;
+ 
+    mj2 = 2*mj;
+    lj  = n/mj2;
+ 
+    for(j=0; j<lj; j++){
+       jw  = j*mj; jc  = j*mj2;
+       rp = Re w[jw];
+       up = Im w[jw];
+       if(sign<0.0) up = -up;
+       if(mj<2){
+ /* special case mj=1 */ 
+          Re d[jc] = rp*(Re a[jw] - Re b[jw]) - 
+                           up*(Im a[jw] - Im b[jw]);
+          Im d[jc] = up*(Re a[jw] - Re b[jw]) 
+                         + rp*(Im a[jw] - Im b[jw]);
+          Re c[jc] = Re a[jw] + Re b[jw];
+          Im c[jc] = Im a[jw] + Im b[jw];
+       } else {
+ /* mj>=2 case */ 
+          wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp; 
+          wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up; 
+          v6 = vec_ld(0,wr);
+          v7 = vec_ld(0,wu);
+          v7 = vec_xor(v7,vminus);
+          for(k=0; k<mj; k+=2){
+             v0 = vec_ld(0,(vector float *) &a[jw+k]); /* read a */
+             v1 = vec_ld(0,(vector float *) &b[jw+k]); /* read b */
+             v2 = vec_add(v0, v1);
+             vec_st(v2,0,(vector float *) &c[jc+k]);   /* store c */
+             v3 = vec_sub(v0, v1); 
+             v4 = vec_perm(v3,v3,pv3201);
+             v0 = vec_madd(v6,v3,vzero);
+             v1 = vec_madd(v7,v4,vzero);
+             v2 = vec_add(v0,v1);                     /* w*(a - b) */
+             vec_st(v2,0,(vector float *) &d[jc+k]);  /* store a */
+          }
+       }
+    }
+ }
+ #undef cplx 
+ #undef Re 
+ #undef Im
+ #include <math.h>
+ float ggl(float *ds)
+ {
+ 
+ /* generate u(0,1) distributed random numbers. 
+    Seed ds must be saved between calls. ggl is 
+    essentially the same as the IMSL routine RNUM. 
+ 
+    W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: 
+    a modification of a fortran version from 
+    I. Vattulainen, Tampere Univ. of Technology, 
+    Finland, 1992 */
+ 
+    double t,d2=0.2147483647e10;
+    t   = (float) *ds;
+    t   = fmod(0.16807e5*t,d2);
+    *ds = (float) t;
+    return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }