[llvm-commits] CVS: llvm-test/SingleSource/UnitTests/Vector/SSE/Makefile sse.expandfft.c sse.isamax.c sse.stepfft.c
Evan Cheng
evan.cheng at apple.com
Mon Apr 3 17:48:08 PDT 2006
Changes in directory llvm-test/SingleSource/UnitTests/Vector/SSE:
Makefile added (r1.1)
sse.expandfft.c added (r1.1)
sse.isamax.c added (r1.1)
sse.stepfft.c added (r1.1)
---
Log message:
Added some Altivec and SSE examples from:
Introduction to Parallel Computing
A practical guide with examples in C
Oxford Texts in Applied and Engineering Mathematics No. 9
Oxford University Press, February 2004
ISBN: 0-19-851576-6 (hardback), 0-19-851577-4 (paperback)
http://people.inf.ethz.ch/arbenz/book/
---
Diffs of the changes: (+585 -0)
Makefile | 8 +
sse.expandfft.c | 263 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
sse.isamax.c | 119 +++++++++++++++++++++++++
sse.stepfft.c | 195 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 585 insertions(+)
Index: llvm-test/SingleSource/UnitTests/Vector/SSE/Makefile
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/SSE/Makefile:1.1
*** /dev/null Mon Apr 3 19:48:04 2006
--- llvm-test/SingleSource/UnitTests/Vector/SSE/Makefile Mon Apr 3 19:47:54 2006
***************
*** 0 ****
--- 1,8 ----
+ # SingleSource/UnitTests/Vector/SSE/Makefile
+
+ DIRS =
+ LEVEL = ../../../..
+ include $(LEVEL)/SingleSource/Makefile.singlesrc
+
+ TARGET_CFLAGS += -msse3
+ LCCFLAGS += -msse3
Index: llvm-test/SingleSource/UnitTests/Vector/SSE/sse.expandfft.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/SSE/sse.expandfft.c:1.1
*** /dev/null Mon Apr 3 19:48:08 2006
--- llvm-test/SingleSource/UnitTests/Vector/SSE/sse.expandfft.c Mon Apr 3 19:47:54 2006
***************
*** 0 ****
--- 1,263 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <time.h>
+ #include <float.h>
+ #include "xmmintrin.h"
+ #define N 256
+ #define N2 N/2
+ main()
+ {
+ /*
+ SSE version of cfft2 - uses Intel intrinsics.
+ Expanded version
+
+ wpp, SAM. Math. ETHZ 21 May, 2002
+ */
+ int first,i,icase,it,n;
+ float error,fnm1,seed,sign,z0,z1,ggl();
+ float *x,*y,*z,*w;
+ float t1,ln2,mflops;
+ void cffti(),cfft2();
+ /* allocate storage for x,y,z,w on 4-word bndr. */
+ x = (float *)_mm_malloc(8*N, 16);
+ y = (float *)_mm_malloc(8*N, 16);
+ z = (float *)_mm_malloc(8*N, 16);
+ w = (float *)_mm_malloc(4*N, 16);
+ first = 1;
+ seed = 331.0;
+ for(icase=0;icase<2;icase++){
+ if(first){
+ for(i=0;i<2*N;i+=2){
+ z0 = ggl(&seed); /* real part of array */
+ z1 = ggl(&seed); /* imaginary part of array */
+ x[i] = z0;
+ z[i] = z0; /* copy of initial real data */
+ x[i+1] = z1;
+ z[i+1] = z1; /* copy of initial imag. data */
+ }
+ } else {
+ for(i=0;i<2*N;i+=2){
+ z0 = 0; /* real part of array */
+ z1 = 0; /* imaginary part of array */
+ x[i] = z0;
+ z[i] = z0; /* copy of initial real data */
+ x[i+1] = z1;
+ z[i+1] = z1; /* copy of initial imag. data */
+ }
+ }
+ /* initialize sine/cosine tables */
+ n = N;
+ cffti(n,w);
+ /* transform forward, back */
+ if(first){
+ sign = 1.0;
+ cfft2(n,x,y,w,sign);
+ sign = -1.0;
+ cfft2(n,y,x,w,sign);
+ /* results should be same as initial multiplied by N */
+ fnm1 = 1.0/((float) n);
+ error = 0.0;
+ for(i=0;i<2*N;i+=2){
+ error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
+ (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
+ }
+ error = sqrt(fnm1*error);
+ printf(" for n=%d, fwd/bck error=%e\n",N,error);
+ first = 0;
+ } else {
+ t1 = ((float)clock())/((float) CLOCKS_PER_SEC);
+ for(it=0;it<1000;it++){
+ sign = +1.0;
+ cfft2(n,x,y,w,sign);
+ sign = -1.0;
+ cfft2(n,y,x,w,sign);
+ }
+ t1 = ((float)clock())/((float) CLOCKS_PER_SEC) - t1;
+ t1 = t1/2000.0;
+ ln2 = 10.0; // reset this for different N
+ mflops = 5.0*((float) N)*ln2/((1.e+6)*t1);
+ printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops);
+ }
+ }
+ }
+ void cfft2(n,x,y,w,sign)
+ int n;
+ float x[][2],y[][2],w[][2],sign;
+ {
+ int jb, jc, jw, k, k2, lj, m, j, mj, mj2, pass, tgle;
+ float (*a)[2],(*b)[2],(*c)[2],(*d)[2];
+ float (*aa)[2],(*bb)[2],(*cb)[2],(*dd)[2];
+ float rp,up,wr[4],wu[4];
+ __m128 V0,V1,V2,V3,V4,V5,V6,V7;
+ __m128 V8,V9,V10,V11,V12,V13,V14,V15;
+
+ if(n<=1){
+ y[0][0] = x[0][0];
+ y[0][1] = x[0][1];
+ return;
+ }
+ m = (int) (log((float) n)/log(1.99));
+ mj = 1;
+ mj2 = 2;
+ lj = n/2;
+ // first pass thru data: x -> y
+ a = (void *)&x[0][0];
+ b = (void *)&x[n/2][0];
+ c = (void *)&y[0][0];
+ d = (void *)&y[1][0];
+ for(j=0;j<lj;j++){
+ jc = j*mj2;
+ rp = w[j][0]; up = w[j][1];
+ if(sign<0.0) up = -up;
+ d[jc][0] = rp*(a[j][0] - b[j][0]) - up*(a[j][1] - b[j][1]);
+ d[jc][1] = up*(a[j][0] - b[j][0]) + rp*(a[j][1] - b[j][1]);
+ c[jc][0] = a[j][0] + b[j][0];
+ c[jc][1] = a[j][1] + b[j][1];
+ }
+ if(n==2) return;
+ // next pass is mj = 2
+ mj = 2;
+ mj2 = 4;
+ lj = n/4;
+ a = (void *)&y[0][0];
+ b = (void *)&y[n/2][0];
+ c = (void *)&x[0][0];
+ d = (void *)&x[mj][0];
+ if(n==4){
+ c = (void *)&y[0][0];
+ d = (void *)&y[mj][0];
+ }
+ for(j=0;j<lj;j++){
+ jw = j*mj; jc = j*mj2;
+ rp = w[jw][0]; up = w[jw][1];
+ if(sign<0.0) up = -up;
+ wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
+ wu[0] = -up; wu[1] = up; wu[2] = -up; wu[3] = up;
+ V6 = _mm_load_ps(wr);
+ V7 = _mm_load_ps(wu);
+ V0 = _mm_load_ps(&a[jw][0]);
+ V1 = _mm_load_ps(&b[jw][0]);
+ V2 = _mm_add_ps(V0,V1); /* a + b */
+ _mm_store_ps(&c[jc][0],V2); /* store c */
+ V3 = _mm_sub_ps(V0,V1); /* a - b */
+ V4 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(2,3,0,1));
+ V0 = _mm_mul_ps(V6,V3);
+ V1 = _mm_mul_ps(V7,V4);
+ V2 = _mm_add_ps(V0,V1); /* w*(a - b) */
+ _mm_store_ps(&d[jc][0],V2); /* store d */
+ }
+ if(n==4) return;
+ mj *= 2;
+ mj2 = 2*mj;
+ lj = n/mj2;
+ tgle = 0;
+ for(pass=2;pass<m-1;pass++){
+ if(tgle){
+ a = (void *)&y[0][0];
+ b = (void *)&y[n/2][0];
+ c = (void *)&x[0][0];
+ d = (void *)&x[mj][0];
+ tgle = 0;
+ } else {
+ a = (void *)&x[0][0];
+ b = (void *)&x[n/2][0];
+ c = (void *)&y[0][0];
+ d = (void *)&y[mj][0];
+ tgle = 1;
+ }
+ for(j=0; j<lj; j++){
+ jw = j*mj; jc = j*mj2;
+ rp = w[jw][0];
+ up = w[jw][1];
+ if(sign<0.0) up = -up;
+ wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
+ wu[0] = -up; wu[1] = up; wu[2] = -up; wu[3] = up;
+ V6 = _mm_load_ps(wr);
+ V7 = _mm_load_ps(wu);
+ for(k=0; k<mj; k+=4){
+ k2 = k + 2;
+ V0 = _mm_load_ps(&a[jw+k][0]);
+ V1 = _mm_load_ps(&b[jw+k][0]);
+ V2 = _mm_add_ps(V0,V1); /* a + b */
+ _mm_store_ps(&c[jc+k][0],V2); /* store c */
+ V3 = _mm_sub_ps(V0,V1); /* a - b */
+ V4 = _mm_shuffle_ps(V3,V3,_MM_SHUFFLE(2,3,0,1));
+ V0 = _mm_mul_ps(V6,V3);
+ V1 = _mm_mul_ps(V7,V4);
+ V2 = _mm_add_ps(V0,V1); /* w*(a - b) */
+ _mm_store_ps(&d[jc+k][0],V2); /* store d */
+ V8 = _mm_load_ps(&a[jw+k2][0]);
+ V9 = _mm_load_ps(&b[jw+k2][0]);
+ V10 = _mm_add_ps(V8,V9); /* a + b */
+ _mm_store_ps(&c[jc+k2][0],V10); /* store c */
+ V11 = _mm_sub_ps(V8,V9); /* a - b */
+ V12 = _mm_shuffle_ps(V11,V11,_MM_SHUFFLE(2,3,0,1));
+ V8 = _mm_mul_ps(V6,V11);
+ V9 = _mm_mul_ps(V7,V12);
+ V10 = _mm_add_ps(V8,V9); /* w*(a - b) */
+ _mm_store_ps(&d[jc+k2][0],V10); /* store d */
+ }
+ }
+ mj *= 2;
+ mj2 = 2*mj;
+ lj = n/mj2;
+ }
+ /* last pass thru data: in-place if previous in y */
+ c = (void *)&y[0][0];
+ d = (void *)&y[n/2][0];
+ if(tgle) {
+ a = (void *)&y[0][0];
+ b = (void *)&y[n/2][0];
+ } else {
+ a = (void *)&x[0][0];
+ b = (void *)&x[n/2][0];
+ }
+ for(k=0; k<(n/2); k+=4){
+ k2 = k + 2;
+ V0 = _mm_load_ps(&a[k][0]);
+ V1 = _mm_load_ps(&b[k][0]);
+ V2 = _mm_add_ps(V0,V1); /* a + b */
+ _mm_store_ps(&c[k][0],V2); /* store c */
+ V3 = _mm_sub_ps(V0,V1); /* a - b */
+ _mm_store_ps(&d[k][0],V3); /* store d */
+ V4 = _mm_load_ps(&a[k2][0]);
+ V5 = _mm_load_ps(&b[k2][0]);
+ V6 = _mm_add_ps(V4,V5); /* a + b */
+ _mm_store_ps(&c[k2][0],V6); /* store c */
+ V7 = _mm_sub_ps(V4,V5); /* a - b */
+ _mm_store_ps(&d[k2][0],V7); /* store d */
+ }
+ }
+ void cffti(int n, float w[][2])
+ {
+ int i,n2;
+ float aw,arg,pi;
+ pi = 3.141592653589793;
+ n2 = n/2;
+ aw = 2.0*pi/((float)n);
+ #pragma vector
+ for(i=0;i<n2;i++){
+ arg = aw*((float)i);
+ w[i][0] = cos(arg);
+ w[i][1] = sin(arg);
+ }
+ }
+ #include <math.h>
+ float ggl(float *ds)
+ {
+ /* generate u(0,1) distributed random numbers.
+ Seed ds must be saved between calls. ggl is
+ essentially the same as the IMSL routine RNUM.
+
+ W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+ a modification of a fortran version from
+ I. Vattulainen, Tampere Univ. of Technology,
+ Finland, 1992 */
+
+ double t,d2=0.2147483647e10;
+ t = (float) *ds;
+ t = fmod(0.16807e5*t,d2);
+ *ds = (float) t;
+ return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
+
Index: llvm-test/SingleSource/UnitTests/Vector/SSE/sse.isamax.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/SSE/sse.isamax.c:1.1
*** /dev/null Mon Apr 3 19:48:08 2006
--- llvm-test/SingleSource/UnitTests/Vector/SSE/sse.isamax.c Mon Apr 3 19:47:54 2006
***************
*** 0 ****
--- 1,119 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <float.h>
+ #include "xmmintrin.h"
+ #define N 20
+ main()
+ {
+ /*
+ SSE unit step isamax with alignment code. From Section
+ 3.5.7 of Petersen and Arbenz "Intro. to Parallel Computing,"
+ Oxford Univ. Press, 2004.
+
+ wpp 31/7/2002
+ */
+ float x[N];
+ int i,im;
+ int isamax0(int,float *);
+ for(i=0;i<N;i++){
+ x[i] = -2.0 + (float) i;
+ }
+ x[7] =33.0;
+ im = isamax0(N,x);
+ printf(" maximum index = %d\n",im);
+ printf(" maximum value = %e\n",x[im]);
+ }
+ #define NS 12
+ int isamax0(int n, float *x)
+ {
+ float bbig,ebig,bres,*xp;
+ int eres,i,ibbig,iebig,align,nsegs,mb,nn;
+ __m128 offset4,V0,V1,V2,V3,V6,V7;
+ float xbig[8],indx[8];
+ // n < NS done in scalar mode
+ if(n < NS){
+ iebig = 0;
+ bbig = 0.0;
+ for(i=0;i<n;i++){
+ if(fabs(x[i]) > bbig){
+ bbig = fabs(x[i]);
+ iebig = i;
+ }
+ }
+ return(iebig);
+ }
+ // n >= NS case done in SSE mode
+ V7 = _mm_set_ps(3.0,2.0,1.0,0.0);
+ V2 = _mm_set_ps(3.0,2.0,1.0,0.0);
+ V6 = _mm_set_ps1(-0.0);
+ offset4 = _mm_set_ps1(4.0);
+ align = ((unsigned int) x >> 2) & 0x03;
+ if(align == 1){ // bres = 3 case
+ bbig = fabsf(x[0]); ibbig = 0;
+ bres = 3.0; nn = n - 3;
+ for(i=1;i<3;i++){
+ if(fabsf(x[i]) > bbig){
+ bbig = fabsf(x[i]); ibbig = i;
+ }
+ }
+ } else if(align == 2){ // bres = 2 case
+ bbig = fabsf(x[0]); ibbig = 0;
+ bres = 2.0; nn = n - 2;
+ if(fabsf(x[1]) > bbig){
+ bbig = fabsf(x[1]); ibbig = 1;
+ }
+ } else if(align == 1){ // bres = 1 case
+ bbig = fabsf(x[0]); ibbig = 0;
+ bres = 1.0; nn = n - 1;
+ } else { // bres = 0 case
+ bbig = 0.0; ibbig = 0; nn = n;
+ bres = 0.0;
+ }
+ xp = x + (int) bres;
+ nsegs = (nn >> 2) - 2;
+ eres = nn - 4*(nsegs+2);
+ V0 = _mm_load_ps(xp); xp += 4; // first four in 4/time seq.
+ V1 = _mm_load_ps(xp); xp += 4; // next four in 4/time seq.
+ V0 = _mm_andnot_ps(V6,V0); // take absolute value
+ for(i=0;i<nsegs;i++){
+ V1 = _mm_andnot_ps(V6,V1); // take absolute value
+ V3 = _mm_cmpnle_ps(V1,V0); // compare old max of 4 to new
+ mb = _mm_movemask_ps(V3); // any of 4 bigger?
+ V2 = _mm_add_ps(V2,offset4); // add offset
+ if(mb > 0){
+ V0 = _mm_max_ps(V0,V1);
+ V3 = _mm_and_ps(V2,V3);
+ V7 = _mm_max_ps(V7,V3);
+ }
+ V1 = _mm_load_ps(xp); xp += 4; // bottom load next four
+ }
+ // finish up the last segment of 4
+ V1 = _mm_andnot_ps(V6,V1); // take absolute value
+ V3 = _mm_cmpnle_ps(V1,V0); // compare old max of 4 to new
+ mb = _mm_movemask_ps(V3); // any of 4 bigger?
+ V2 = _mm_add_ps(V2,offset4); // add offset
+ if(mb > 0){
+ V0 = _mm_max_ps(V0,V1);
+ V3 = _mm_and_ps(V2,V3);
+ V7 = _mm_max_ps(V7,V3);
+ }
+ // Now finish up: segment maxima are in V0, indices in V7
+ _mm_store_ps(xbig,V0);
+ _mm_store_ps(indx,V7);
+ if(eres>0){
+ for(i=0;i<eres;i++){
+ xbig[4+i] = fabsf(*(xp++));
+ indx[4+i] = (float) (nn+i);
+ }
+ }
+ ebig = bbig;
+ iebig = ibbig;
+ for(i=0;i<4+eres;i++){
+ if(xbig[i] > ebig){
+ ebig = xbig[i];
+ iebig = (int) indx[i];
+ }
+ }
+ return(iebig);
+ }
+ #undef NS
Index: llvm-test/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c
diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c:1.1
*** /dev/null Mon Apr 3 19:48:08 2006
--- llvm-test/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c Mon Apr 3 19:47:54 2006
***************
*** 0 ****
--- 1,195 ----
+ #include <stdio.h>
+ #include <math.h>
+ #include <time.h>
+ #include <float.h>
+ #include "xmmintrin.h"
+ #define N 1024
+ #define N2 N/2
+ main()
+ {
+ /*
+ SSE version of cfft2 - uses INTEL intrinsics
+ W. Petersen, SAM. Math. ETHZ 2 May, 2002
+ */
+ int first,i,icase,it,n;
+ float seed,error,fnm1,sign,z0,z1,ggl();
+ float *x,*y,*z,*w;
+ float t1,ln2,mflops;
+ void cffti(),cfft2();
+ /* allocate storage for x,y,z,w on 4-word bndr. */
+ x = (float *)_mm_malloc(8*N, 16);
+ y = (float *)_mm_malloc(8*N, 16);
+ z = (float *)_mm_malloc(8*N, 16);
+ w = (float *)_mm_malloc(4*N, 16);
+ first = 1;
+ seed = 331.0;
+ for(icase=0;icase<2;icase++){
+ if(first){
+ for(i=0;i<2*N;i+=2){
+ z0 = ggl(&seed); /* real part of array */
+ z1 = ggl(&seed); /* imaginary part of array */
+ x[i] = z0;
+ z[i] = z0; /* copy of initial real data */
+ x[i+1] = z1;
+ z[i+1] = z1; /* copy of initial imag. data */
+ }
+ } else {
+ for(i=0;i<2*N;i+=2){
+ z0 = 0; /* real part of array */
+ z1 = 0; /* imaginary part of array */
+ x[i] = z0;
+ z[i] = z0; /* copy of initial real data */
+ x[i+1] = z1;
+ z[i+1] = z1; /* copy of initial imag. data */
+ }
+ }
+ /* initialize sine/cosine tables */
+ n = N;
+ cffti(n,w);
+ /* transform forward, back */
+ if(first){
+ sign = 1.0;
+ cfft2(n,x,y,w,sign);
+ sign = -1.0;
+ cfft2(n,y,x,w,sign);
+ /* results should be same as initial multiplied by N */
+ fnm1 = 1.0/((float) n);
+ error = 0.0;
+ for(i=0;i<2*N;i+=2){
+ error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) +
+ (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]);
+ }
+ error = sqrt(fnm1*error);
+ printf(" for n=%d, fwd/bck error=%e\n",N,error);
+ first = 0;
+ } else {
+ t1 = ((float)clock())/((float) CLOCKS_PER_SEC);
+ for(it=0;it<10000;it++){
+ sign = +1.0;
+ cfft2(n,x,y,w,sign);
+ sign = -1.0;
+ cfft2(n,y,x,w,sign);
+ }
+ t1 = ((float)clock())/((float) CLOCKS_PER_SEC) - t1;
+ t1 = t1/20000.0;
+ ln2 = 10.0; /* reset this for different N */
+ mflops = 5.0*((float) N)*ln2/((1.e+6)*t1);
+ printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops);
+ }
+ }
+ }
+ void cfft2(n,x,y,w,sign)
+ int n;
+ float x[][2],y[][2],w[][2],sign;
+ {
+ int jb, m, j, mj, tgle;
+ void ccopy(),step();
+ m = (int) (log((float) n)/log(1.99));
+ mj = 1;
+ tgle = 1; /* toggling switch for work array */
+ step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+ for(j=0;j<m-2;j++){
+ mj *= 2;
+ if(tgle){
+ step(n,mj,&y[0][0],&y[n/2][0],&x[0][0],&x[mj][0],w,sign);
+ tgle = 0;
+ } else {
+ step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+ tgle = 1;
+ }
+ }
+ /* last pass thru data: move y to x if needed */
+ if(tgle) {
+ ccopy(n,y,x);
+ }
+ mj = n/2;
+ step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign);
+ }
+ void cffti(int n, float w[][2])
+ {
+ int i,n2;
+ float aw,arg,pi;
+ pi = 3.141592653589793;
+ n2 = n/2;
+ aw = 2.0*pi/((float)n);
+ #pragma vector
+ for(i=0;i<n2;i++){
+ arg = aw*((float)i);
+ w[i][0] = cos(arg);
+ w[i][1] = sin(arg);
+ }
+ }
+ void ccopy(int n, float x[][2], float y[][2])
+ {
+ int i;
+ for(i=0;i<n;i++){
+ y[i][0] = x[i][0];
+ y[i][1] = x[i][1];
+ }
+ }
+ #include <math.h>
+ float ggl(float *ds)
+ {
+
+ /* generate u(0,1) distributed random numbers.
+ Seed ds must be saved between calls. ggl is
+ essentially the same as the IMSL routine RNUM.
+
+ W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ:
+ a modification of a fortran version from
+ I. Vattulainen, Tampere Univ. of Technology,
+ Finland, 1992 */
+
+ double t,d2=0.2147483647e10;
+ t = (float) *ds;
+ t = fmod(0.16807e5*t,d2);
+ *ds = (float) t;
+ return((float) ((t-1.0e0)/(d2-1.0e0)));
+ }
+ void step(n,mj,a,b,c,d,w,sign)
+ int n, mj;
+ float a[][2],b[][2],c[][2],d[][2],w[][2],sign;
+ {
+ int j,k,jc,jw,l,lj,mj2,mseg;
+ float rp,up,wr[4],wu[4];
+ __m128 xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
+
+ mj2 = 2*mj;
+ lj = n/mj2;
+
+ for(j=0; j<lj; j++){
+ jw = j*mj; jc = j*mj2;
+ rp = w[jw][0];
+ up = w[jw][1];
+ if(sign<0.0) up = -up;
+ if(mj<2){
+ /* special case mj=1 */
+ d[jc][0] = rp*(a[jw][0] - b[jw][0]) - up*(a[jw][1] - b[jw][1]);
+ d[jc][1] = up*(a[jw][0] - b[jw][0]) + rp*(a[jw][1] - b[jw][1]);
+ c[jc][0] = a[jw][0] + b[jw][0];
+ c[jc][1] = a[jw][1] + b[jw][1];
+ } else {
+ /* mj>=2 case */
+ /* _mm_prefetch((char *)&a[jw][0],_MM_HINT_NTA); */
+ /* _mm_prefetch((char *)&b[jw][0],_MM_HINT_NTA); */
+ wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp;
+ wu[0] = -up; wu[1] = up; wu[2] = -up; wu[3] = up;
+ xmm6 = _mm_load_ps(wr);
+ xmm7 = _mm_load_ps(wu);
+ for(k=0; k<mj; k+=2){
+ /* _mm_prefetch((char *)&a[jw+k][0],_MM_HINT_NTA); */
+ /* _mm_prefetch((char *)&b[jw+k][0],_MM_HINT_NTA); */
+ xmm0 = _mm_load_ps(&a[jw+k][0]);
+ xmm1 = _mm_load_ps(&b[jw+k][0]);
+ xmm2 = _mm_add_ps(xmm0,xmm1); /* a + b */
+ _mm_store_ps(&c[jc+k][0],xmm2); /* store c */
+ xmm3 = _mm_sub_ps(xmm0,xmm1); /* a - b */
+ xmm4 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(2,3,0,1));
+ xmm0 = _mm_mul_ps(xmm6,xmm3);
+ xmm1 = _mm_mul_ps(xmm7,xmm4);
+ xmm2 = _mm_add_ps(xmm0,xmm1); /* w*(a - b) */
+ _mm_store_ps(&d[jc+k][0],xmm2); /* store d */
+ }
+ }
+ }
+ }
More information about the llvm-commits
mailing list