[llvm-commits] CVS: llvm-test/SingleSource/UnitTests/Vector/SSE/Makefile sse.expandfft.c sse.isamax.c sse.stepfft.c

Mon Apr 3 17:48:08 PDT 2006

Changes in directory llvm-test/SingleSource/UnitTests/Vector/SSE:

Makefile added (r1.1)
sse.expandfft.c added (r1.1)
sse.isamax.c added (r1.1)
sse.stepfft.c added (r1.1)
---
Log message:

Added some Altivec and SSE examples from:

Introduction to Parallel Computing 
A practical guide with examples in C 

Oxford Texts in Applied and Engineering Mathematics No. 9 
Oxford University Press, February 2004 
ISBN: 0-19-851576-6 (hardback), 0-19-851577-4 (paperback) 

http://people.inf.ethz.ch/arbenz/book/

---
Diffs of the changes:  (+585 -0)

 Makefile        |    8 +
 sse.expandfft.c |  263 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 sse.isamax.c    |  119 +++++++++++++++++++++++++
 sse.stepfft.c   |  195 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 585 insertions(+)

Index: llvm-test/SingleSource/UnitTests/Vector/SSE/Makefile
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/SSE/Makefile:1.1
*** /dev/null	Mon Apr  3 19:48:04 2006
--- llvm-test/SingleSource/UnitTests/Vector/SSE/Makefile	Mon Apr  3 19:47:54 2006
***************
*** 0 ****
--- 1,8 ----
+ # SingleSource/UnitTests/Vector/SSE/Makefile
+ 
+ DIRS = 
+ LEVEL = ../../../..
+ include $(LEVEL)/SingleSource/Makefile.singlesrc
+ 
+ TARGET_CFLAGS += -msse3
+ LCCFLAGS += -msse3

Index: llvm-test/SingleSource/UnitTests/Vector/SSE/sse.expandfft.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/SSE/sse.expandfft.c:1.1
*** /dev/null	Mon Apr  3 19:48:08 2006
--- llvm-test/SingleSource/UnitTests/Vector/SSE/sse.expandfft.c	Mon Apr  3 19:47:54 2006
***************
*** 0 ****
--- 1,263 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <time.h>
+ #include <float.h>
+ #include "xmmintrin.h"
+ #define N 256
+ #define N2 N/2
+ main()
+ {
+ /* 
+    SSE version of cfft2 - uses Intel intrinsics.
+    Expanded version 
+    
+               wpp, SAM. Math. ETHZ 21 May, 2002 
+ */
+    int first,i,icase,it,n;
+    float error,fnm1,seed,sign,z0,z1,ggl();
+    float *x,*y,*z,*w;
+    float t1,ln2,mflops;
+    void cffti(),cfft2();
+ /* allocate storage for x,y,z,w on 4-word bndr. */
+    x = (float *)_mm_malloc(8*N, 16);
+    y = (float *)_mm_malloc(8*N, 16);
+    z = (float *)_mm_malloc(8*N, 16);
+    w = (float *)_mm_malloc(4*N, 16);
+    first = 1;
+    seed  = 331.0;
+    for(icase=0;icase<2;icase++){
+    if(first){
+       for(i=0;i<2*N;i+=2){
+          z0 = ggl(&seed);     /* real part of array */
+          z1 = ggl(&seed);     /* imaginary part of array */
+          x[i] = z0;
+          z[i] = z0;           /* copy of initial real data */
+          x[i+1] = z1;
+          z[i+1] = z1;         /* copy of initial imag. data */
+       }
+    } else {
+       for(i=0;i<2*N;i+=2){
+          z0 = 0;              /* real part of array */
+          z1 = 0;              /* imaginary part of array */
+          x[i] = z0;
+          z[i] = z0;           /* copy of initial real data */
+          x[i+1] = z1;
+          z[i+1] = z1;         /* copy of initial imag. data */
+       }
+    }
+ /* initialize sine/cosine tables */
+    n = N;
+    cffti(n,w);
+ /* transform forward, back */
+    if(first){
+       sign = 1.0;
+       cfft2(n,x,y,w,sign);
+       sign = -1.0;
+       cfft2(n,y,x,w,sign);
+ /* results should be same as initial multiplied by N */
+       fnm1 = 1.0/((float) n);
+       error = 0.0;
+       for(i=0;i<2*N;i+=2){
+          error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
+                 (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
+       }
+       error = sqrt(fnm1*error);
+       printf(" for n=%d, fwd/bck error=%e\n",N,error);
+       first = 0;
+    } else {
+       t1   = ((float)clock())/((float) CLOCKS_PER_SEC);
+       for(it=0;it<1000;it++){
+          sign = +1.0;
+          cfft2(n,x,y,w,sign);
+          sign = -1.0;
+          cfft2(n,y,x,w,sign);
+       }
+       t1   = ((float)clock())/((float) CLOCKS_PER_SEC) - t1;
+       t1   = t1/2000.0;
+       ln2 = 10.0; // reset this for different N 
+       mflops = 5.0*((float) N)*ln2/((1.e+6)*t1);
+       printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops);
+    }
+    }
+ }
+ void cfft2(n,x,y,w,sign)
+ int n;
+ float x[][2],y[][2],w[][2],sign;
+ {
+    int jb, jc, jw, k, k2, lj, m, j, mj, mj2, pass, tgle;
+    float (*a)[2],(*b)[2],(*c)[2],(*d)[2];
+    float (*aa)[2],(*bb)[2],(*cb)[2],(*dd)[2];
+    float rp,up,wr[4],wu[4];
+    __m128 V0,V1,V2,V3,V4,V5,V6,V7;
+    __m128 V8,V9,V10,V11,V12,V13,V14,V15;
+ 
+    if(n<=1){
+       y[0][0] = x[0][0];
+       y[0][1] = x[0][1];
+       return;
+    }
+    m    = (int) (log((float) n)/log(1.99));
+    mj   = 1;
+    mj2  = 2;
+    lj   = n/2;
+ // first pass thru data: x -> y
+    a = (void *)&x[0][0];
+    b = (void *)&x[n/2][0];
+    c = (void *)&y[0][0];
+    d = (void *)&y[1][0];
+    for(j=0;j<lj;j++){
+       jc  = j*mj2;
+       rp = w[j][0]; up = w[j][1];
+       if(sign<0.0) up = -up;
+       d[jc][0] = rp*(a[j][0] - b[j][0]) - up*(a[j][1] - b[j][1]);
+       d[jc][1] = up*(a[j][0] - b[j][0]) + rp*(a[j][1] - b[j][1]);
+       c[jc][0] = a[j][0] + b[j][0];
+       c[jc][1] = a[j][1] + b[j][1];
+    }
+    if(n==2) return;
+ // next pass is mj = 2
+    mj  = 2;
+    mj2 = 4;
+    lj  = n/4;
+    a = (void *)&y[0][0];
+    b = (void *)&y[n/2][0];
+    c = (void *)&x[0][0];
+    d = (void *)&x[mj][0];
+    if(n==4){
+       c = (void *)&y[0][0];
+       d = (void *)&y[mj][0];
+    }
+    for(j=0;j<lj;j++){
+       jw = j*mj; jc = j*mj2;
+       rp = w[jw][0]; up = w[jw][1];
+       if(sign<0.0) up = -up;
+       wr[0] =  rp; wr[1] = rp; wr[2] =  rp; wr[3] = rp;
+       wu[0] = -up; wu[1] = up; wu[2] = -up; wu[3] = up;
+       V6 = _mm_load_ps(wr);
+       V7 = _mm_load_ps(wu);
+       V0 = _mm_load_ps(&a[jw][0]);
+       V1 = _mm_load_ps(&b[jw][0]);
+       V2 = _mm_add_ps(V0,V1);            /* a + b */
+       _mm_store_ps(&c[jc][0],V2);          /* store c */
+       V3 = _mm_sub_ps(V0,V1);            /* a - b */
+       V4 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(2,3,0,1));
+       V0 = _mm_mul_ps(V6,V3);
+       V1 = _mm_mul_ps(V7,V4);
+       V2 = _mm_add_ps(V0,V1);            /* w*(a - b) */
+       _mm_store_ps(&d[jc][0],V2);          /* store d */
+    }
+    if(n==4) return;
+    mj  *= 2;
+    mj2  = 2*mj;
+    lj   = n/mj2; 
+    tgle = 0;  
+    for(pass=2;pass<m-1;pass++){
+       if(tgle){
+          a = (void *)&y[0][0];
+          b = (void *)&y[n/2][0];
+          c = (void *)&x[0][0];
+          d = (void *)&x[mj][0];
+  	 tgle = 0;
+       } else {
+          a = (void *)&x[0][0];
+          b = (void *)&x[n/2][0];
+          c = (void *)&y[0][0];
+          d = (void *)&y[mj][0];
+ 	 tgle = 1;
+       }
+       for(j=0; j<lj; j++){
+          jw = j*mj; jc = j*mj2;
+          rp = w[jw][0];
+          up = w[jw][1];
+          if(sign<0.0) up = -up;
+          wr[0] =  rp; wr[1] = rp; wr[2] =  rp; wr[3] = rp;
+          wu[0] = -up; wu[1] = up; wu[2] = -up; wu[3] = up;
+          V6 = _mm_load_ps(wr);
+          V7 = _mm_load_ps(wu);
+          for(k=0; k<mj; k+=4){
+             k2   = k + 2;
+             V0 = _mm_load_ps(&a[jw+k][0]);
+             V1 = _mm_load_ps(&b[jw+k][0]);
+             V2 = _mm_add_ps(V0,V1);            /* a + b */
+             _mm_store_ps(&c[jc+k][0],V2);      /* store c */
+             V3 = _mm_sub_ps(V0,V1);            /* a - b */
+             V4 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(2,3,0,1));
+             V0 = _mm_mul_ps(V6,V3);
+             V1 = _mm_mul_ps(V7,V4);
+             V2 = _mm_add_ps(V0,V1);            /* w*(a - b) */
+             _mm_store_ps(&d[jc+k][0],V2);      /* store d */
+ 	    V8 = _mm_load_ps(&a[jw+k2][0]);
+ 	    V9 = _mm_load_ps(&b[jw+k2][0]);
+ 	    V10 = _mm_add_ps(V8,V9);		/* a + b */
+ 	    _mm_store_ps(&c[jc+k2][0],V10);     /* store c */
+ 	    V11 = _mm_sub_ps(V8,V9);		/* a - b */
+ 	    V12 = _mm_shuffle_ps(V11,V11,_MM_SHUFFLE(2,3,0,1));
+ 	    V8  = _mm_mul_ps(V6,V11);
+ 	    V9  = _mm_mul_ps(V7,V12);
+ 	    V10 = _mm_add_ps(V8,V9);		/* w*(a - b) */
+ 	    _mm_store_ps(&d[jc+k2][0],V10);	/* store d */
+          }
+       }
+       mj  *= 2;
+       mj2  = 2*mj;
+       lj   = n/mj2;
+    }
+ /* last pass thru data: in-place if previous in y */
+    c = (void *)&y[0][0];
+    d = (void *)&y[n/2][0];
+    if(tgle) {
+       a = (void *)&y[0][0];
+       b = (void *)&y[n/2][0];
+    } else {
+       a = (void *)&x[0][0];
+       b = (void *)&x[n/2][0];
+    }
+    for(k=0; k<(n/2); k+=4){
+       k2   = k + 2;
+       V0 = _mm_load_ps(&a[k][0]);
+       V1 = _mm_load_ps(&b[k][0]);
+       V2 = _mm_add_ps(V0,V1);         /* a + b */
+       _mm_store_ps(&c[k][0],V2);      /* store c */
+       V3 = _mm_sub_ps(V0,V1);         /* a - b */
+       _mm_store_ps(&d[k][0],V3);      /* store d */
+       V4 = _mm_load_ps(&a[k2][0]);
+       V5 = _mm_load_ps(&b[k2][0]);
+       V6 = _mm_add_ps(V4,V5);         /* a + b */
+       _mm_store_ps(&c[k2][0],V6);     /* store c */
+       V7 = _mm_sub_ps(V4,V5);         /* a - b */
+       _mm_store_ps(&d[k2][0],V7);     /* store d */
+    }
+ }
+ void cffti(int n, float w[][2])
+ {
+    int i,n2;
+    float aw,arg,pi;
+    pi = 3.141592653589793;
+    n2 = n/2;
+    aw = 2.0*pi/((float)n);
+ #pragma vector
+    for(i=0;i<n2;i++){
+       arg   = aw*((float)i);
+       w[i][0] = cos(arg);
+       w[i][1] = sin(arg);
+    }
+ }
+ #include <math.h>
+ float ggl(float *ds)
+ {
+ /* generate u(0,1) distributed random numbers.
+    Seed ds must be saved between calls. ggl is
+    essentially the same as the IMSL routine RNUM.
+                                                                                 
+    W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+    a modification of a fortran version from
+    I. Vattulainen, Tampere Univ. of Technology,
+    Finland, 1992 */
+                                                                                 
+    double t,d2=0.2147483647e10;
+    t   = (float) *ds;
+    t   = fmod(0.16807e5*t,d2);
+    *ds = (float) t;
+    return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
+ 

Index: llvm-test/SingleSource/UnitTests/Vector/SSE/sse.isamax.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/SSE/sse.isamax.c:1.1
*** /dev/null	Mon Apr  3 19:48:08 2006
--- llvm-test/SingleSource/UnitTests/Vector/SSE/sse.isamax.c	Mon Apr  3 19:47:54 2006
***************
*** 0 ****
--- 1,119 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <float.h>
+ #include "xmmintrin.h"
+ #define N 20
+ main()
+ {
+ /* 
+   SSE unit step isamax with alignment code. From Section
+   3.5.7 of Petersen and Arbenz "Intro. to Parallel Computing,"
+   Oxford Univ. Press, 2004.
+ 
+                              wpp 31/7/2002 
+ */
+   float x[N];
+   int i,im;
+   int isamax0(int,float *);
+   for(i=0;i<N;i++){
+      x[i] = -2.0 + (float) i;
+   }
+   x[7] =33.0;
+   im = isamax0(N,x);
+   printf(" maximum index = %d\n",im);
+   printf(" maximum value = %e\n",x[im]);
+ }
+ #define NS 12
+ int isamax0(int n, float *x)
+ {
+   float bbig,ebig,bres,*xp;
+   int eres,i,ibbig,iebig,align,nsegs,mb,nn;
+   __m128 offset4,V0,V1,V2,V3,V6,V7;
+   float xbig[8],indx[8];
+ // n < NS done in scalar mode
+   if(n < NS){
+      iebig = 0;
+      bbig  = 0.0;
+      for(i=0;i<n;i++){
+         if(fabs(x[i]) > bbig){
+            bbig  = fabs(x[i]);
+            iebig = i;
+         }
+      }
+      return(iebig);
+   }
+ // n >= NS case done in SSE mode
+   V7      = _mm_set_ps(3.0,2.0,1.0,0.0);
+   V2      = _mm_set_ps(3.0,2.0,1.0,0.0);
+   V6      = _mm_set_ps1(-0.0);
+   offset4 = _mm_set_ps1(4.0);
+   align = ((unsigned int) x >> 2) & 0x03;
+   if(align == 1){        // bres = 3 case
+      bbig = fabsf(x[0]); ibbig = 0; 
+      bres = 3.0; nn = n - 3;
+      for(i=1;i<3;i++){
+         if(fabsf(x[i]) > bbig){
+            bbig = fabsf(x[i]); ibbig = i;
+         }
+      }
+   } else if(align == 2){    // bres = 2 case
+      bbig = fabsf(x[0]); ibbig = 0;
+      bres = 2.0; nn = n - 2;
+      if(fabsf(x[1]) > bbig){
+         bbig  = fabsf(x[1]); ibbig = 1;
+      }
+   } else if(align == 1){    // bres = 1 case
+      bbig = fabsf(x[0]); ibbig = 0;
+      bres = 1.0; nn = n - 1;
+   } else {                  // bres = 0 case
+      bbig = 0.0; ibbig = 0; nn = n;
+      bres = 0.0;
+   }
+   xp = x + (int) bres;
+   nsegs = (nn >> 2) - 2;
+   eres  = nn - 4*(nsegs+2); 
+   V0 = _mm_load_ps(xp); xp += 4;   // first four in 4/time seq.
+   V1 = _mm_load_ps(xp); xp += 4;   // next four in 4/time seq.
+   V0 = _mm_andnot_ps(V6,V0);       // take absolute value
+   for(i=0;i<nsegs;i++){
+      V1 = _mm_andnot_ps(V6,V1);    // take absolute value
+      V3 = _mm_cmpnle_ps(V1,V0);    // compare old max of 4 to new
+      mb = _mm_movemask_ps(V3);     // any of 4 bigger?
+      V2 = _mm_add_ps(V2,offset4);  // add offset
+      if(mb > 0){
+         V0 = _mm_max_ps(V0,V1);
+         V3 = _mm_and_ps(V2,V3);
+         V7 = _mm_max_ps(V7,V3);
+      }
+      V1 = _mm_load_ps(xp); xp += 4;  // bottom load next four
+   }
+ // finish up the last segment of 4
+   V1 = _mm_andnot_ps(V6,V1);    // take absolute value
+   V3 = _mm_cmpnle_ps(V1,V0);    // compare old max of 4 to new
+   mb = _mm_movemask_ps(V3);     // any of 4 bigger?
+   V2 = _mm_add_ps(V2,offset4);  // add offset
+   if(mb > 0){
+      V0 = _mm_max_ps(V0,V1);
+      V3 = _mm_and_ps(V2,V3);
+      V7 = _mm_max_ps(V7,V3);
+   }
+ // Now finish up: segment maxima are in V0, indices in V7
+   _mm_store_ps(xbig,V0);
+   _mm_store_ps(indx,V7);
+   if(eres>0){
+     for(i=0;i<eres;i++){
+        xbig[4+i] = fabsf(*(xp++));
+        indx[4+i] = (float) (nn+i);
+     }
+   }
+   ebig  = bbig; 
+   iebig = ibbig; 
+   for(i=0;i<4+eres;i++){
+      if(xbig[i] > ebig){
+         ebig = xbig[i];
+         iebig = (int) indx[i];
+      }
+   }
+   return(iebig);
+ }
+ #undef NS

Index: llvm-test/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c:1.1
*** /dev/null	Mon Apr  3 19:48:08 2006
--- llvm-test/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c	Mon Apr  3 19:47:54 2006
***************
*** 0 ****
--- 1,195 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <time.h>
+ #include <float.h>
+ #include "xmmintrin.h"
+ #define N 1024
+ #define N2 N/2
+ main()
+ {
+ /* 
+    SSE version of cfft2 - uses INTEL intrinsics
+        W. Petersen, SAM. Math. ETHZ 2 May, 2002 
+ */
+    int first,i,icase,it,n;
+    float seed,error,fnm1,sign,z0,z1,ggl();
+    float *x,*y,*z,*w;
+    float t1,ln2,mflops;
+    void cffti(),cfft2();
+ /* allocate storage for x,y,z,w on 4-word bndr. */
+    x = (float *)_mm_malloc(8*N, 16);
+    y = (float *)_mm_malloc(8*N, 16);
+    z = (float *)_mm_malloc(8*N, 16);
+    w = (float *)_mm_malloc(4*N, 16);
+    first = 1;
+    seed  = 331.0;
+    for(icase=0;icase<2;icase++){
+    if(first){
+       for(i=0;i<2*N;i+=2){
+          z0 = ggl(&seed);     /* real part of array */
+          z1 = ggl(&seed);     /* imaginary part of array */
+          x[i] = z0;
+          z[i] = z0;           /* copy of initial real data */
+          x[i+1] = z1;
+          z[i+1] = z1;         /* copy of initial imag. data */
+       }
+    } else {
+       for(i=0;i<2*N;i+=2){
+          z0 = 0;              /* real part of array */
+          z1 = 0;              /* imaginary part of array */
+          x[i] = z0;
+          z[i] = z0;           /* copy of initial real data */
+          x[i+1] = z1;
+          z[i+1] = z1;         /* copy of initial imag. data */
+       }
+    }
+ /* initialize sine/cosine tables */
+    n = N;
+    cffti(n,w);
+ /* transform forward, back */
+    if(first){
+       sign = 1.0;
+       cfft2(n,x,y,w,sign);
+       sign = -1.0;
+       cfft2(n,y,x,w,sign);
+ /* results should be same as initial multiplied by N */
+       fnm1 = 1.0/((float) n);
+       error = 0.0;
+       for(i=0;i<2*N;i+=2){
+          error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
+                   (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
+       }
+       error = sqrt(fnm1*error);
+       printf(" for n=%d, fwd/bck error=%e\n",N,error);
+       first = 0;
+    } else {
+       t1   = ((float)clock())/((float) CLOCKS_PER_SEC);
+       for(it=0;it<10000;it++){
+          sign = +1.0;
+          cfft2(n,x,y,w,sign);
+          sign = -1.0;
+          cfft2(n,y,x,w,sign);
+       }
+       t1   = ((float)clock())/((float) CLOCKS_PER_SEC) - t1;
+       t1   = t1/20000.0;
+       ln2 = 10.0; /* reset this for different N  */
+       mflops = 5.0*((float) N)*ln2/((1.e+6)*t1);
+       printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops);
+    }
+    }
+ }
+ void cfft2(n,x,y,w,sign)
+ int n;
+ float x[][2],y[][2],w[][2],sign;
+ {
+    int jb, m, j, mj, tgle;
+    void ccopy(),step();
+    m    = (int) (log((float) n)/log(1.99));
+    mj   = 1;
+    tgle = 1;  /* toggling switch for work array */
+    step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+    for(j=0;j<m-2;j++){
+       mj *= 2;
+       if(tgle){
+          step(n,mj,&y[0][0],&y[n/2][0],&x[0][0],&x[mj][0],w,sign);
+          tgle = 0;
+       } else {
+          step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+          tgle = 1;
+       }
+    }
+ /* last pass thru data: move y to x if needed */
+    if(tgle) {
+       ccopy(n,y,x);
+    }
+    mj   = n/2;
+    step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+ }
+ void cffti(int n, float w[][2])
+ {
+    int i,n2;
+    float aw,arg,pi;
+    pi = 3.141592653589793;
+    n2 = n/2;
+    aw = 2.0*pi/((float)n);
+ #pragma vector
+    for(i=0;i<n2;i++){
+       arg   = aw*((float)i);
+       w[i][0] = cos(arg);
+       w[i][1] = sin(arg);
+    }
+ }
+ void ccopy(int n, float x[][2], float y[][2])
+ {
+    int i;
+    for(i=0;i<n;i++){
+       y[i][0] = x[i][0];
+       y[i][1] = x[i][1];
+    }
+ }
+ #include <math.h>
+ float ggl(float *ds)
+ {
+                                                                                 
+ /* generate u(0,1) distributed random numbers.
+    Seed ds must be saved between calls. ggl is
+    essentially the same as the IMSL routine RNUM.
+                                                                                 
+    W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+    a modification of a fortran version from
+    I. Vattulainen, Tampere Univ. of Technology,
+    Finland, 1992 */
+                                                                                 
+    double t,d2=0.2147483647e10;
+    t   = (float) *ds;
+    t   = fmod(0.16807e5*t,d2);
+    *ds = (float) t;
+    return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
+ void step(n,mj,a,b,c,d,w,sign)
+ int n, mj; 
+ float a[][2],b[][2],c[][2],d[][2],w[][2],sign;
+ {
+    int j,k,jc,jw,l,lj,mj2,mseg;
+    float rp,up,wr[4],wu[4];
+    __m128 xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
+ 
+    mj2 = 2*mj;
+    lj  = n/mj2;
+ 
+    for(j=0; j<lj; j++){
+       jw  = j*mj; jc  = j*mj2;
+       rp = w[jw][0];
+       up = w[jw][1];
+       if(sign<0.0) up = -up;
+       if(mj<2){
+ /* special case mj=1 */ 
+          d[jc][0] = rp*(a[jw][0] - b[jw][0]) - up*(a[jw][1] - b[jw][1]);
+          d[jc][1] = up*(a[jw][0] - b[jw][0]) + rp*(a[jw][1] - b[jw][1]);
+          c[jc][0] = a[jw][0] + b[jw][0];
+          c[jc][1] = a[jw][1] + b[jw][1];
+       } else {
+ /* mj>=2 case */ 
+ /*       _mm_prefetch((char *)&a[jw][0],_MM_HINT_NTA); */
+ /*       _mm_prefetch((char *)&b[jw][0],_MM_HINT_NTA); */
+          wr[0] =  rp; wr[1] = rp; wr[2] =  rp; wr[3] = rp;
+          wu[0] = -up; wu[1] = up; wu[2] = -up; wu[3] = up;
+          xmm6 = _mm_load_ps(wr);
+          xmm7 = _mm_load_ps(wu);
+          for(k=0; k<mj; k+=2){
+ /*          _mm_prefetch((char *)&a[jw+k][0],_MM_HINT_NTA); */
+ /*          _mm_prefetch((char *)&b[jw+k][0],_MM_HINT_NTA); */
+             xmm0 = _mm_load_ps(&a[jw+k][0]);
+             xmm1 = _mm_load_ps(&b[jw+k][0]);
+             xmm2 = _mm_add_ps(xmm0,xmm1);            /* a + b */
+             _mm_store_ps(&c[jc+k][0],xmm2);          /* store c */
+             xmm3 = _mm_sub_ps(xmm0,xmm1);            /* a - b */
+             xmm4 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(2,3,0,1));   
+             xmm0 = _mm_mul_ps(xmm6,xmm3);   
+             xmm1 = _mm_mul_ps(xmm7,xmm4);   
+             xmm2 = _mm_add_ps(xmm0,xmm1);            /* w*(a - b) */
+             _mm_store_ps(&d[jc+k][0],xmm2);          /* store d */
+          }
+       }
+    }
+ }