[llvm-commits] CVS: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c alti.isamax.c alti.sdot.c alti.stepfft.c
Evan Cheng
evan.cheng at apple.com
Mon Apr 3 17:48:09 PDT 2006
Changes in directory llvm-test/SingleSource/UnitTests/Vector/Altivec:
alti.expandfft.c added (r1.1)
alti.isamax.c added (r1.1)
alti.sdot.c added (r1.1)
alti.stepfft.c added (r1.1)
---
Log message:
Added some Altivec and SSE examples from:
Introduction to Parallel Computing
A practical guide with examples in C
Oxford Texts in Applied and Engineering Mathematics No. 9
Oxford University Press, February 2004
ISBN: 0-19-851576-6 (hardback), 0-19-851577-4 (paperback)
http://people.inf.ethz.ch/arbenz/book/
---
Diffs of the changes: (+725 -0)
alti.expandfft.c | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
alti.isamax.c | 131 +++++++++++++++++++++++++
alti.sdot.c | 103 +++++++++++++++++++
alti.stepfft.c | 204 +++++++++++++++++++++++++++++++++++++++
4 files changed, 725 insertions(+)
Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c:1.1
*** /dev/null Mon Apr 3 19:48:04 2006
--- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c Mon Apr 3 19:47:54 2006
***************
*** 0 ****
--- 1,287 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <time.h>
+ #include <float.h>
+ #define N 1048576
+ #define N2 N/2
+ main()
+ {
+ /*
+ Example of Apple Altivec coded binary radix FFT
+ using intrinsics from Petersen and Arbenz "Intro.
+ to Parallel Computing," Section 3.6
+
+ This is an expanded version of a generic work-space
+ FFT: steps are in-line. cfft2(n,x,y,w,sign) takes complex
+ n-array "x" (Fortran real,aimag,real,aimag,... order)
+ and writes its DFT in "y". Both input "x" and the
+ original contents of "y" are destroyed. Initialization
+ for array "w" (size n/2 complex of twiddle factors
+ (exp(twopi*i*k/n), for k=0..n/2-1)) is computed once
+ by cffti(n,w).
+
+ WPP, SAM. Math. ETHZ, 1 June, 2002
+ */
+
+ int first,i,icase,it,ln2,n;
+ int nits=1000000;
+ static float seed = 331.0;
+ float error,fnm1,sign,z0,z1,ggl();
+ float *x,*y,*z,*w;
+ double t1,mflops;
+ void cffti(),cfft2();
+ /* allocate storage for x,y,z,w on 4-word bndr. */
+ x = (float *) malloc(8*N);
+ y = (float *) malloc(8*N);
+ z = (float *) malloc(8*N);
+ w = (float *) malloc(4*N);
+ n = 2;
+ for(ln2=1;ln2<21;ln2++){
+ first = 1;
+ for(icase=0;icase<2;icase++){
+ if(first){
+ for(i=0;i<2*n;i+=2){
+ z0 = ggl(&seed); /* real part of array */
+ z1 = ggl(&seed); /* imaginary part of array */
+ x[i] = z0;
+ z[i] = z0; /* copy of initial real data */
+ x[i+1] = z1;
+ z[i+1] = z1; /* copy of initial imag. data */
+ }
+ } else {
+ for(i=0;i<2*n;i+=2){
+ z0 = 0; /* real part of array */
+ z1 = 0; /* imaginary part of array */
+ x[i] = z0;
+ z[i] = z0; /* copy of initial real data */
+ x[i+1] = z1;
+ z[i+1] = z1; /* copy of initial imag. data */
+ }
+ }
+ /* initialize sine/cosine tables */
+ cffti(n,w);
+ /* transform forward, back */
+ if(first){
+ sign = 1.0;
+ cfft2(n,x,y,w,sign);
+ sign = -1.0;
+ cfft2(n,y,x,w,sign);
+ /* results should be same as initial multiplied by n */
+ fnm1 = 1.0/((float) n);
+ error = 0.0;
+ for(i=0;i<2*n;i+=2){
+ error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
+ (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
+ }
+ error = sqrt(fnm1*error);
+ printf(" for n=%d, fwd/bck error=%e\n",n,error);
+ first = 0;
+ } else {
+ t1 = ((double)clock())/((double) CLOCKS_PER_SEC);
+ for(it=0;it<nits;it++){
+ sign = +1.0;
+ cfft2(n,x,y,w,sign);
+ sign = -1.0;
+ cfft2(n,y,x,w,sign);
+ }
+ t1 = ((double)clock())/((double) CLOCKS_PER_SEC) - t1;
+ t1 = 0.5*t1/((double) nits);
+ mflops = 5.0*((double) n)*((double) ln2)/((1.e+6)*t1);
+ printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops);
+ }
+ }
+ if((ln2%4)==0) nits /= 10;
+ n *= 2;
+ }
+ }
+ void cfft2(unsigned int n,float x[][2],float y[][2],float w[][2], float sign)
+ {
+
+ /*
+ altivec version of cfft2 from Petersen and Arbenz book, "Intro.
+ to Parallel Computing", Oxford Univ. Press, 2003, Section 3.6
+ wpp 14. Dec. 2003
+ */
+
+ int jb,jc,jd,jw,k,k2,k4,lj,m,j,mj,mj2,pass,tgle;
+ float rp,up,wr[4],wu[4];
+ float *a,*b,*c,*d;
+ const vector float vminus = (vector float)(-0.,0.,-0.,0.);
+ const vector float vzero = (vector float)(0.,0.,0.,0.);
+ const vector unsigned char pv3201 =
+ (vector unsigned char)(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11);
+ vector float V0,V1,V2,V3,V4,V5,V6,V7;
+ vector float V8,V9,V10,V11,V12,V13,V14,V15;
+
+ if(n<=1){
+ y[0][0] = x[0][0];
+ y[0][1] = x[0][1];
+ return;
+ }
+ m = (int) (log((float) n)/log(1.99));
+ mj = 1;
+ mj2 = 2;
+ lj = n/2;
+ /* first pass thru data: x -> y */
+ for(j=0;j<lj;j++){
+ jb = n/2+j; jc = j*mj2; jd = jc + 1;
+ rp = w[j][0]; up = w[j][1];
+ if(sign<0.0) up = -up;
+ y[jd][0] = rp*(x[j][0] - x[jb][0]) - up*(x[j][1] - x[jb][1]);
+ y[jd][1] = up*(x[j][0] - x[jb][0]) + rp*(x[j][1] - x[jb][1]);
+ y[jc][0] = x[j][0] + x[jb][0];
+ y[jc][1] = x[j][1] + x[jb][1];
+ }
+ if(n==2) return;
+ /* next pass is mj = 2 */
+ mj = 2;
+ mj2 = 4;
+ lj = n/4;
+ a = (float *)&y[0][0];
+ b = (float *)&y[n/2][0];
+ c = (float *)&x[0][0];
+ d = (float *)&x[mj][0];
+ if(n==4){
+ c = (float *)&y[0][0];
+ d = (float *)&y[mj][0];
+ }
+ for(j=0;j<lj;j++){
+ jw = j*mj; jc = j*mj2; jd = 2*jc;
+ rp = w[jw][0]; up = w[jw][1];
+ if(sign<0.0) up = -up;
+ wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
+ wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up;
+ V6 = vec_ld(0,wr);
+ V7 = vec_ld(0,wu);
+ V7 = vec_xor(V7,vminus);
+ V0 = vec_ld(0,(vector float *) (a+jc));
+ V1 = vec_ld(0,(vector float *) (b+jc));
+ V2 = vec_add(V0,V1); /* a + b */
+ vec_st(V2,0,(vector float *) (c+jd)); /* store c */
+ V3 = vec_sub(V0,V1); /* a - b */
+ V4 = vec_perm(V3,V3,pv3201);
+ V0 = vec_madd(V6,V3,vzero);
+ V1 = vec_madd(V7,V4,vzero);
+ V2 = vec_add(V0,V1); /* w*(a - b) */
+ vec_st(V2,0,(vector float*) (d+jd)); /* store d */
+ }
+ if(n==4) return;
+ mj *= 2;
+ mj2 = 2*mj;
+ lj = n/mj2;
+ tgle = 0;
+ for(pass=2;pass<m-1;pass++){
+ if(tgle){
+ a = (float *)&y[0][0];
+ b = (float *)&y[n/2][0];
+ c = (float *)&x[0][0];
+ d = (float *)&x[mj][0];
+ tgle = 0;
+ } else {
+ a = (float *)&x[0][0];
+ b = (float *)&x[n/2][0];
+ c = (float *)&y[0][0];
+ d = (float *)&y[mj][0];
+ tgle = 1;
+ }
+ for(j=0; j<lj; j++){
+ jw = j*mj; jc = j*mj2; jd = 2*jc;
+ rp = w[jw][0];
+ up = w[jw][1];
+ if(sign<0.0) up = -up;
+ wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
+ wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up;
+ V6 = vec_ld(0,wr);
+ V7 = vec_ld(0,wu);
+ V7 = vec_xor(V7,vminus);
+ for(k=0; k<mj; k+=4){
+ k2 = 2*k; k4 = k2+4;
+ V0 = vec_ld(0,(vector float *) (a+jc+k2));
+ V1 = vec_ld(0,(vector float *) (b+jc+k2));
+ V2 = vec_add(V0,V1); /* a + b */
+ vec_st(V2,0,(vector float*) (c+jd+k2)); /* store c */
+ V3 = vec_sub(V0,V1); /* a - b */
+ V4 = vec_perm(V3,V3,pv3201);
+ V0 = vec_madd(V6,V3,vzero);
+ V1 = vec_madd(V7,V4,vzero);
+ V2 = vec_add(V0,V1); /* w*(a - b) */
+ vec_st(V2,0,(vector float *) (d+jd+k2)); /* store d */
+ V8 = vec_ld(0,(vector float *) (a+jc+k4));
+ V9 = vec_ld(0,(vector float *) (b+jc+k4));
+ V10 = vec_add(V8,V9); /* a + b */
+ vec_st(V10,0,(vector float *) (c+jd+k4)); /* store c */
+ V11 = vec_sub(V8,V9); /* a - b */
+ V12 = vec_perm(V11,V11,pv3201);
+ V8 = vec_madd(V6,V11,vzero);
+ V9 = vec_madd(V7,V12,vzero);
+ V10 = vec_add(V8,V9); /* w*(a - b) */
+ vec_st(V10,0,(vector float *) (d+jd+k4)); /* store d */
+ }
+ }
+ mj *= 2;
+ mj2 = 2*mj;
+ lj = n/mj2;
+ }
+ /* last pass thru data: in-place if previous in y */
+ c = (float *)&y[0][0];
+ d = (float *)&y[n/2][0];
+ if(tgle) {
+ a = (float *)&y[0][0];
+ b = (float *)&y[n/2][0];
+ } else {
+ a = (float *)&x[0][0];
+ b = (float *)&x[n/2][0];
+ }
+ for(k=0; k<(n/2); k+=4){
+ k2 = 2*k; k4 = k2+4;
+ V0 = vec_ld(0,(vector float *) (a+k2));
+ V1 = vec_ld(0,(vector float *) (b+k2));
+ V2 = vec_add(V0,V1); /* a + b */
+ vec_st(V2,0,(vector float*) (c+k2)); /* store c */
+ V3 = vec_sub(V0,V1); /* a - b */
+ vec_st(V3,0,(vector float *) (d+k2)); /* store d */
+ V4 = vec_ld(0,(vector float *) (a+k4));
+ V5 = vec_ld(0,(vector float *) (b+k4));
+ V6 = vec_add(V4,V5); /* a + b */
+ vec_st(V6,0,(vector float *) (c+k4)); /* store c */
+ V7 = vec_sub(V4,V5); /* a - b */
+ vec_st(V7,0,(vector float *) (d+k4)); /* store d */
+ }
+ }
+ void cffti(int n, float w[][2])
+ {
+
+ /* initialization routine for cfft2: computes
+ cos(twopi*k),sin(twopi*k) for k=0..n/2-1
+ - the "twiddle factors" for a binary radix FFT */
+
+ int i,n2;
+ float aw,arg,pi;
+ pi = 3.141592653589793;
+ n2 = n/2;
+ aw = 2.0*pi/((float)n);
+ for(i=0;i<n2;i++){
+ arg = aw*((float)i);
+ w[i][0] = cos(arg);
+ w[i][1] = sin(arg);
+ }
+ }
+ #include <math.h>
+ float ggl(float *ds)
+ {
+
+ /* generate u(0,1) distributed random numbers.
+ Seed ds must be saved between calls. ggl is
+ essentially the same as the IMSL routine RNUM.
+
+ W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+ a modification of a fortran version from
+ I. Vattulainen, Tampere Univ. of Technology,
+ Finland, 1992 */
+
+ double t,d2=0.2147483647e10;
+ t = (float) *ds;
+ t = fmod(0.16807e5*t,d2);
+ *ds = (float) t;
+ return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c:1.1
*** /dev/null Mon Apr 3 19:48:09 2006
--- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c Mon Apr 3 19:47:54 2006
***************
*** 0 ****
--- 1,131 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <float.h>
+ #define N 1027
+ main()
+ {
+ /*
+ Mac G-4 unit step isamax for arbitrary N code
+ This is an Altivec version of isamax0 from Section 3.5.7
+ in Arbenz and Petersen, "Intro. to Parallel Computing"
+ Oxford Univ. Press, 2004
+ wpp 5/8/2002
+ */
+ float x[N];
+ float xb;
+ int err,flag,i,im,k,ki,kl,ib,n0,n;
+ int isamax(int,float *);
+ static float seed = 331.0;
+ float ggl(float*);
+ flag = 0; // error flag
+ kl = 3;
+ n0 = 1;
+ for(k=0;k<5;k++){
+ for(ki=0;ki<kl;ki++){
+ n = n0 + ki;
+ ib = 0; xb = 0.0;
+ for(i=0;i<n;i++){
+ x[i] = ggl(&seed);
+ if(fabs(x[i]) > xb){
+ xb = fabs(x[i]);
+ ib = i;
+ }
+ }
+ im = isamax(n,x);
+ err = ib - im;
+ if(err != 0){
+ printf(" err in isamax: n = %d, ib = %d, im = %d\n",n,ib,im);
+ flag = 1;
+ }
+ }
+ n0 *= 4; // increase n
+ kl = 4; // for n > 1, 3 steps of increase in n
+ }
+ if(flag==0) printf(" All n tests pass\n");
+ }
+ #define NS 12
+ int isamax(int n, float *x)
+ {
+ float rbig,*xp;
+ int i,ii,nres,nsegs,ibig,irbig;
+ vector float V0,V1,V6;
+ vector bool int V3;
+ vector float V2 = (vector float) (0.0,1.0,2.0,3.0);
+ vector float V7 = (vector float) (0.0,1.0,2.0,3.0);
+ const vector float incr_4 = (vector float) (4.0,4.0,4.0,4.0);
+ const vector float minus0 = (vector float) (-0.0,-0.0,-0.0,-0.0);
+ float big,xbig[4],indx[4];
+ // n < NS done in scalar mode
+ if(n < NS){
+ ibig = 0;
+ rbig = 0.0;
+ for(i=0;i<n;i++){
+ if(fabs(x[i]) > rbig){
+ rbig = fabs(x[i]);
+ ibig = i;
+ }
+ }
+ return(ibig);
+ }
+ // n >= NS case done with altivec
+ nsegs = (n >> 2) - 1;
+ nres = n - ((nsegs+1) << 2); // nres = n mod 4
+ V2 = vec_add(V2,incr_4); // increment next index
+ xp = x;
+ V0 = vec_ld(0,xp); xp += 4; // first four
+ V1 = vec_ld(0,xp); xp += 4; // next four
+ V0 = vec_abs(V0); // absolute value of first four
+ for(i=0;i<nsegs;i++){
+ V1 = vec_abs(V1); // take absolute value fo next segment
+ V3 = vec_cmpgt(V1,V0); // compare accumulated 4 to next 4
+ V0 = vec_sel(V0,V1,V3); // select new or old accumulation
+ V7 = vec_sel(V7,V2,V3); // select new index or old
+ V1 = vec_ld(0,xp); xp += 4; // bottom load next 4
+ V2 = vec_add(V2,incr_4);
+ }
+ V1 = vec_ld(0,xp); xp += 4; // bottom load next four
+ V3 = vec_cmpgt(V1,V0); // compare accumulated to last 4
+ V0 = vec_sel(V0,V1,V3); // select accumulation to last 4
+ V7 = vec_sel(V7,V2,V3); // select index of accum. to last 4
+ // Now finish up: segment maxima are in V0, indices in V7
+ vec_st(V0,0,xbig);
+ vec_st(V7,0,indx);
+ ii = ((n >> 2) << 2);
+ big = 0.0;
+ ibig = 0.0;
+ for(i=0;i<nres;i++){
+ if(fabs(x[ii]) > big){
+ big = fabs(x[ii]);
+ ibig = ii;
+ }
+ ii++;
+ }
+ for(i=0;i<4;i++){
+ if(xbig[i] > big){
+ big = xbig[i];
+ ibig = (int) indx[i];
+ }
+ }
+ return(ibig);
+ }
+ #undef NS
+ #include <math.h>
+ float ggl(float *ds)
+ {
+
+ /* generate u(0,1) distributed random numbers.
+ Seed ds must be saved between calls. ggl is
+ essentially the same as the IMSL routine RNUM.
+
+ W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+ a modification of a fortran version from
+ I. Vattulainen, Tampere Univ. of Technology,
+ Finland, 1992 */
+
+ double t,d2=0.2147483647e10;
+ t = (float) *ds;
+ t = fmod(0.16807e5*t,d2);
+ *ds = (float) t;
+ return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
+
Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c:1.1
*** /dev/null Mon Apr 3 19:48:09 2006
--- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c Mon Apr 3 19:47:54 2006
***************
*** 0 ****
--- 1,103 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <float.h>
+ #define N 1027
+ main()
+ {
+ /* Mac G-4 sdot for arbitrary N wpp 6/8/2002 */
+ float x[N],y[N],tres,res,eps;
+ int flag,i,k,ki,kl,n0,n;
+ static float seed = 331.0;
+ float sdot(int,float *,float *);
+ float ggl(float *);
+ eps = FLT_EPSILON; /* machine eps */
+ n0 = 1; kl = 3;
+ flag = 0;
+ for(k=0;k<5;k++){
+ for(ki=0;ki<kl;ki++){
+ n = n0 + ki;
+ tres = 0.0;
+ for(i=0;i<n;i++){
+ x[i] = ggl(&seed);
+ y[i] = ggl(&seed);
+ tres = tres + x[i]*y[i];
+ }
+ res = sdot(n,x,y);
+ if(fabs(tres-res) > ((float) n)*eps){
+ flag = 1;
+ printf(" n = %d, test sdot value = %e, sdot value = %e\n",
+ n,tres,res);
+ }
+ }
+ n0 *= 4;
+ kl = 4;
+ }
+ if(flag == 0) printf(" All n tests passed\n");
+ }
+ #define NS 12
+ float sdot(int n, float *x, float *y)
+ {
+ float sum,*xp,*yp;
+ int i,ii,nres,nsegs;
+ vector float V7 = (vector float)(0.0,0.0,0.0,0.0);
+ vector float V0,V1;
+ float psum[4];
+ // n < NS done in scalar mode
+ if(n < NS){
+ sum = x[0]*y[0];
+ for(i=1;i<n;i++){
+ sum += x[i]*y[i];
+ }
+ return(sum);
+ }
+ // n >= NS case done with altivec
+ xp = x;
+ yp = y;
+ V0 = vec_ld(0,xp); xp += 4; // increment next index
+ V1 = vec_ld(0,yp); yp += 4; // increment next index
+ nsegs = (n >> 2) - 1;
+ nres = n - ((nsegs+1) << 2); // nres = n mod 4
+ for(i=0;i<nsegs;i++){
+ V7 = vec_madd(V0,V1,V7); // partial sum gp. of 4
+ V0 = vec_ld(0,xp); xp += 4; // bottom load next 4 x
+ V1 = vec_ld(0,yp); yp += 4; // bottom load next 4 y
+ }
+ V7 = vec_madd(V0,V1,V7); // final partial sum
+ // Now finish up: segment maxima are in V0, indices in V7
+ vec_st(V7,0,psum);
+ if(nres > 0){
+ ii = ((n >> 2) << 2);
+ sum = x[ii]*y[ii];
+ ii++;
+ for(i=1;i<nres;i++){
+ sum += x[ii]*y[ii];
+ ii++;
+ }
+ } else {
+ sum = 0.0;
+ }
+ for(i=0;i<4;i++){
+ sum += psum[i];
+ }
+ return(sum);
+ }
+ #undef NS
+ #include <math.h>
+ float ggl(float *ds)
+ {
+
+ /* generate u(0,1) distributed random numbers.
+ Seed ds must be saved between calls. ggl is
+ essentially the same as the IMSL routine RNUM.
+
+ W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+ a modification of a fortran version from
+ I. Vattulainen, Tampere Univ. of Technology,
+ Finland, 1992 */
+
+ double t,d2=0.2147483647e10;
+ t = (float) *ds;
+ t = fmod(0.16807e5*t,d2);
+ *ds = (float) t;
+ return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c:1.1
*** /dev/null Mon Apr 3 19:48:09 2006
--- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c Mon Apr 3 19:47:54 2006
***************
*** 0 ****
--- 1,204 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <time.h>
+ #include <float.h>
+ #define N 128
+ #define N2 N/2
+ main()
+ {
+ /* SSE version of cfft2 - uses Apple intrinsics
+ W. Petersen, SAM. Math. ETHZ 2 May, 2002 */
+ int first,i,icase,it,n;
+ int nits=1000; /* number of iterations for timing test */
+ float error,fnm1,sign,z0,z1,ggl();
+ static float seed = 331.0;
+ float *x,*y,*z,*w;
+ float t1,ln2,mflops;
+ void cffti(),cfft2();
+ /* allocate storage for x,y,z,w on 4-word bndr. */
+ x = (float *)malloc(8*N);
+ y = (float *)malloc(8*N);
+ z = (float *)malloc(8*N);
+ w = (float *)malloc(4*N);
+ first = 1;
+ for(icase=0;icase<2;icase++){
+ if(first){
+ for(i=0;i<2*N;i+=2){
+ z0 = ggl(&seed); /* real part of array */
+ z1 = ggl(&seed); /* imaginary part of array */
+ x[i] = z0;
+ z[i] = z0; /* copy of initial real data */
+ x[i+1] = z1;
+ z[i+1] = z1; /* copy of initial imag. data */
+ }
+ } else {
+ for(i=0;i<2*N;i+=2){
+ z0 = 0; /* real part of array */
+ z1 = 0; /* imaginary part of array */
+ x[i] = z0;
+ z[i] = z0; /* copy of initial real data */
+ x[i+1] = z1;
+ z[i+1] = z1; /* copy of initial imag. data */
+ }
+ }
+ /* initialize sine/cosine tables */
+ n = N;
+ cffti(n,w);
+ /* transform forward, back */
+ if(first){
+ sign = 1.0;
+ cfft2(n,x,y,w,sign);
+ sign = -1.0;
+ cfft2(n,y,x,w,sign);
+ /* results should be same as initial multiplied by N */
+ fnm1 = 1.0/((float) n);
+ error = 0.0;
+ for(i=0;i<2*N;i+=2){
+ error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
+ (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
+ }
+ error = sqrt(fnm1*error);
+ printf(" for n=%d, fwd/bck error=%e\n",N,error);
+ first = 0;
+ } else {
+ t1 = ((float)clock())/((float) CLOCKS_PER_SEC);
+ for(it=0;it<nits;it++){
+ sign = +1.0;
+ cfft2(n,x,y,w,sign);
+ sign = -1.0;
+ cfft2(n,y,x,w,sign);
+ }
+ t1 = ((float)clock())/((float) CLOCKS_PER_SEC) - t1;
+ t1 = t1/(2.0*((float) nits));
+ ln2 = 10.0; /* reset this for different N */
+ mflops = 5.0*((float) N)*ln2/((1.e+6)*t1);
+ printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops);
+ }
+ }
+ }
+ void cfft2(n,x,y,w,sign)
+ int n;
+ float x[][2],y[][2],w[][2],sign;
+ {
+ int jb, m, j, mj, tgle;
+ void ccopy(),step();
+ m = (int) (log((float) n)/log(1.99));
+ mj = 1;
+ tgle = 1; /* toggling switch for work array */
+ step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+ for(j=0;j<m-2;j++){
+ mj *= 2;
+ if(tgle){
+ step(n,mj,&y[0][0],&y[n/2][0],&x[0][0],&x[mj][0],w,sign);
+ tgle = 0;
+ } else {
+ step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+ tgle = 1;
+ }
+ }
+ /* last pass thru data: move y to x if needed */
+ if(tgle) {
+ ccopy(n,y,x);
+ }
+ mj = n/2;
+ step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+ }
+ void cffti(int n, float w[][2])
+ {
+ int i,n2;
+ float aw,arg,pi;
+ pi = 3.141592653589793;
+ n2 = n/2;
+ aw = 2.0*pi/((float)n);
+ for(i=0;i<n2;i++){
+ arg = aw*((float)i);
+ w[i][0] = cos(arg);
+ w[i][1] = sin(arg);
+ }
+ }
+ void ccopy(int n, float x[][2], float y[][2])
+ {
+ int i;
+ for(i=0;i<n;i++){
+ y[i][0] = x[i][0];
+ y[i][1] = x[i][1];
+ }
+ }
+ #define cplx __complex__
+ #define Re __real__
+ #define Im __imag__
+ void step
+ (unsigned int n,unsigned int mj,
+ cplx float *a, __complex__ float *b,
+ cplx float *c, __complex__ float *d,
+ cplx float *w, float sign)
+ {
+ int j,k,jc,jw,l,lj,mj2;
+ float rp,up;
+ float wr[4], wu[4];
+ const vector float vminus = (vector float)(-0.,0.,-0.,0.);
+ const vector float vzero = (vector float)(0.,0.,0.,0.);
+ const vector unsigned char pv3201 =
+ (vector unsigned char)(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11);
+ vector float v0,v1,v2,v3,v4,v5,v6,v7;
+
+ mj2 = 2*mj;
+ lj = n/mj2;
+
+ for(j=0; j<lj; j++){
+ jw = j*mj; jc = j*mj2;
+ rp = Re w[jw];
+ up = Im w[jw];
+ if(sign<0.0) up = -up;
+ if(mj<2){
+ /* special case mj=1 */
+ Re d[jc] = rp*(Re a[jw] - Re b[jw]) -
+ up*(Im a[jw] - Im b[jw]);
+ Im d[jc] = up*(Re a[jw] - Re b[jw])
+ + rp*(Im a[jw] - Im b[jw]);
+ Re c[jc] = Re a[jw] + Re b[jw];
+ Im c[jc] = Im a[jw] + Im b[jw];
+ } else {
+ /* mj>=2 case */
+ wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
+ wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up;
+ v6 = vec_ld(0,wr);
+ v7 = vec_ld(0,wu);
+ v7 = vec_xor(v7,vminus);
+ for(k=0; k<mj; k+=2){
+ v0 = vec_ld(0,(vector float *) &a[jw+k]); /* read a */
+ v1 = vec_ld(0,(vector float *) &b[jw+k]); /* read b */
+ v2 = vec_add(v0, v1);
+ vec_st(v2,0,(vector float *) &c[jc+k]); /* store c */
+ v3 = vec_sub(v0, v1);
+ v4 = vec_perm(v3,v3,pv3201);
+ v0 = vec_madd(v6,v3,vzero);
+ v1 = vec_madd(v7,v4,vzero);
+ v2 = vec_add(v0,v1); /* w*(a - b) */
+ vec_st(v2,0,(vector float *) &d[jc+k]); /* store a */
+ }
+ }
+ }
+ }
+ #undef cplx
+ #undef Re
+ #undef Im
+ #include <math.h>
+ float ggl(float *ds)
+ {
+
+ /* generate u(0,1) distributed random numbers.
+ Seed ds must be saved between calls. ggl is
+ essentially the same as the IMSL routine RNUM.
+
+ W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+ a modification of a fortran version from
+ I. Vattulainen, Tampere Univ. of Technology,
+ Finland, 1992 */
+
+ double t,d2=0.2147483647e10;
+ t = (float) *ds;
+ t = fmod(0.16807e5*t,d2);
+ *ds = (float) t;
+ return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
More information about the llvm-commits
mailing list