[llvm-commits] [vector_llvm] CVS: llvm/examples/SIMD/DCT/Makefile dct.altivec.handwritten.c dct.sse.handwritten.c dct.vectorc.c main.c

Sun Oct 23 15:50:20 PDT 2005

Changes in directory llvm/examples/SIMD/DCT:

Makefile added (r1.1.2.1)
dct.altivec.handwritten.c added (r1.1.2.1)
dct.sse.handwritten.c added (r1.1.2.1)
dct.vectorc.c added (r1.1.2.1)
main.c added (r1.1.2.1)
---
Log message:

Examples to illustrate Vector LLVM's SIMD support.

---
Diffs of the changes:  (+646 -0)

 Makefile                  |    4 
 dct.altivec.handwritten.c |  198 ++++++++++++++++++++++++++++++++++++++++++++++
 dct.sse.handwritten.c     |  129 +++++++++++++++++++++++++++++
 dct.vectorc.c             |  140 ++++++++++++++++++++++++++++++++
 main.c                    |  175 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 646 insertions

Index: llvm/examples/SIMD/DCT/Makefile
diff -c /dev/null llvm/examples/SIMD/DCT/Makefile:1.1.2.1
*** /dev/null	Sun Oct 23 17:49:50 2005
--- llvm/examples/SIMD/DCT/Makefile	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,4 ----
+ NAME= dct
+ 
+ include ../Makefile.common
+ 

Index: llvm/examples/SIMD/DCT/dct.altivec.handwritten.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.altivec.handwritten.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:17 2005
--- llvm/examples/SIMD/DCT/dct.altivec.handwritten.c	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,198 ----
+ static inline void Matrix_Transpose ( vector signed short *input, vector signed short *output)
+ {
+   vector signed short a0, a1, a2, a3, a4, a5, a6, a7;
+   vector signed short b0, b1, b2, b3, b4, b5, b6, b7;
+ 
+   b0 = vec_mergeh( input[0], input[4] );     /* [ 00 40 01 41 02 42 03 43 ]*/
+   b1 = vec_mergel( input[0], input[4] );     /* [ 04 44 05 45 06 46 07 47 ]*/
+   b2 = vec_mergeh( input[1], input[5] );     /* [ 10 50 11 51 12 52 13 53 ]*/
+   b3 = vec_mergel( input[1], input[5] );     /* [ 14 54 15 55 16 56 17 57 ]*/
+   b4 = vec_mergeh( input[2], input[6] );     /* [ 20 60 21 61 22 62 23 63 ]*/
+   b5 = vec_mergel( input[2], input[6] );     /* [ 24 64 25 65 26 66 27 67 ]*/
+   b6 = vec_mergeh( input[3], input[7] );     /* [ 30 70 31 71 32 72 33 73 ]*/
+   b7 = vec_mergel( input[3], input[7] );     /* [ 34 74 35 75 36 76 37 77 ]*/
+ 
+   a0 = vec_mergeh( b0, b4 );                 /* [ 00 20 40 60 01 21 41 61 ]*/
+   a1 = vec_mergel( b0, b4 );                 /* [ 02 22 42 62 03 23 43 63 ]*/
+   a2 = vec_mergeh( b1, b5 );                 /* [ 04 24 44 64 05 25 45 65 ]*/
+   a3 = vec_mergel( b1, b5 );                 /* [ 06 26 46 66 07 27 47 67 ]*/
+   a4 = vec_mergeh( b2, b6 );                 /* [ 10 30 50 70 11 31 51 71 ]*/
+   a5 = vec_mergel( b2, b6 );                 /* [ 12 32 52 72 13 33 53 73 ]*/
+   a6 = vec_mergeh( b3, b7 );                 /* [ 14 34 54 74 15 35 55 75 ]*/
+   a7 = vec_mergel( b3, b7 );                 /* [ 16 36 56 76 17 37 57 77 ]*/
+ 
+   output[0] = vec_mergeh( a0, a4 );          /* [ 00 10 20 30 40 50 60 70 ]*/
+   output[1] = vec_mergel( a0, a4 );          /* [ 01 11 21 31 41 51 61 71 ]*/
+   output[2] = vec_mergeh( a1, a5 );          /* [ 02 12 22 32 42 52 62 72 ]*/
+   output[3] = vec_mergel( a1, a5 );          /* [ 03 13 23 33 43 53 63 73 ]*/
+   output[4] = vec_mergeh( a2, a6 );          /* [ 04 14 24 34 44 54 64 74 ]*/
+   output[5] = vec_mergel( a2, a6 );          /* [ 05 15 25 35 45 55 65 75 ]*/
+   output[6] = vec_mergeh( a3, a7 );          /* [ 06 16 26 36 46 56 66 76 ]*/
+   output[7] = vec_mergel( a3, a7 );          /* [ 07 17 27 37 47 57 67 77 ]*/
+ 
+ }
+ 
+ /***************************************************************
+  *
+  * Copyright:   (c) Copyright Motorola Inc. 1998
+  *
+  * Date:        April 15, 1998
+  *
+  * Macro:       DCT_Transform
+  *
+  * Description: Discrete Cosine Transform implemented by the
+  *              Scaled Chen (II) Algorithm developed by Haifa
+  *              Research Lab.  The major difference between this
+  *              algorithm and the Scaled Chen (I) is that
+  *              certain multiply-subtracts are replaced by
+  *              multiply adds.  A full description of the
+  *              Scaled Chen (I) algorithm can be found in:
+  *              W.C.Chen, C.H.Smith and S.C.Fralick, "A Fast
+  *              Computational Algorithm for the Discrete Cosine
+  *              Transform", IEEE Transactions on Communications,
+  *              Vol. COM-25, No. 9, pp 1004-1009, Sept. 1977.
+  *
+  * Inputs:      vx     : array of vector short
+  *              t0-t10 : temporary vector variables set up by caller
+  *              c4     : cos(4*pi/16)
+  *              mc4    : -c4
+  *              a0     : c6/c2
+  *              a1     : c7/c1
+  *              a2     : c5/c3
+  *              ma2    : -a2
+  *              zero   : a vector of zero elements
+  *
+  * Outputs:     vy     : array of vector short
+  *
+  **************************************************************/
+ 
+ #define DCT_Transform(vx,vy) \
+                                                                    \
+   /* 1st stage. */                                                 \
+   t8 = vec_adds( vx[0], vx[7] );     /* t0 + t7 */                 \
+   t9 = vec_subs( vx[0], vx[7] );     /* t0 - t7 */                 \
+   t0 = vec_adds( vx[1], vx[6] );     /* t1 + t6 */                 \
+   t7 = vec_subs( vx[1], vx[6] );     /* t1 - t6 */                 \
+   t1 = vec_adds( vx[2], vx[5] );     /* t2 + t5 */                 \
+   t6 = vec_subs( vx[2], vx[5] );     /* t2 - t5 */                 \
+   t2 = vec_adds( vx[3], vx[4] );     /* t3 + t4 */                 \
+   t5 = vec_subs( vx[3], vx[4] );     /* t3 - t4 */                 \
+                                                                    \
+   /* 2nd stage. */                                                 \
+   t3 = vec_adds( t8, t2 );           /* (t0+t7) + (t3+t4) */       \
+   t4 = vec_subs( t8, t2 );           /* (t0+t7) - (t3+t4) */       \
+   t2 = vec_adds( t0, t1 );           /* (t1+t6) + (t2+t5) */       \
+   t8 = vec_subs( t0, t1 );           /* (t1+t6) - (t2+t5) */       \
+                                                                    \
+   t1 = vec_adds( t7, t6 );           /* (t1-t6) + (t2-t5) */       \
+   t0 = vec_subs( t7, t6 );           /* (t1-t6) - (t2-t5) */       \
+                                                                    \
+   /* 3rd stage */                                                  \
+   vy[0] = vec_adds( t3, t2 );        /* y0 = t3 + t2 */            \
+   vy[4] = vec_subs( t3, t2 );        /* y4 = t3 - t2 */            \
+   vy[2] = vec_mradds( t8, a0, t4 );  /* y2 = t8 * (a0) + t4 */     \
+   t10 = vec_mradds( t4, a0, zero );  /* t10 = t4 * (a0) */         \
+   vy[6]  = vec_subs( t10, t8 );       /* y6 = t4 * (a0) - t8 */    \
+                                                                    \
+   t6 = vec_mradds( t0, c4, t5 );     /* t6 = t0 * (c4) + t5  */    \
+   t7 = vec_mradds( t0, mc4, t5 );    /* t7 = t0 * (-c4) + t5 */    \
+   t2 = vec_mradds( t1, mc4, t9 );    /* t2 = t1 * (-c4) + t9 */    \
+   t3 = vec_mradds( t1, c4, t9 );     /* t3 = t1 * (c4) + t9  */    \
+                                                                    \
+   /* 4th stage. */                                                 \
+   vy[1] = vec_mradds( t6, a1, t3 );    /* y1 = t6 * (a1) + t3  */  \
+   t9 = vec_mradds( t3, a1, zero );     /* t9 = t3 * (a1) */        \
+   vy[7] = vec_subs( t9, t6 ) ;         /* y7 = t3 * (a1) - t6  */  \
+   vy[5] = vec_mradds( t2, a2, t7 );    /* y5 = t2 * (a2) + t7  */  \
+   vy[3] = vec_mradds( t7, ma2, t2 );   /* y3 = t7 * (-a2) + t2 */
+ 
+ /* Post-scaling matrix -- scaled by 1 */
+ vector signed short PostScale[8] = {
+     (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
+     (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 ),
+     (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
+     (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
+     (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
+     (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
+     (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
+     (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 )
+ };
+ 
+ /***************************************************************
+  *
+  * Copyright:   (c) Copyright Motorola Inc. 1998
+  *
+  * Date:        April 17, 1998
+  *
+  * Function:    DCT
+  *
+  * Description: Scaled Chen (II) algorithm for DCT
+  *              Arithmetic is 16-bit fixed point.
+  *
+  * Inputs:      input - Pointer to input data (short), which
+  *                      must be between -255 and +255.
+  *                      It is assumed that the allocated array
+  *                      has been 128-bit aligned and contains
+  *                      8x8 short elements.
+  *
+  * Outputs:     output - Pointer to output area for the transformed
+  *                       data. The output values are between -2040
+  *                       and 2040. It is assumed that a 128-bit
+  *                       aligned 8x8 array of short has been
+  *                       pre-allocated.
+  *
+  * Return:      None
+  * NOTE(review): main.c declares/calls dct_vector with 4 args; this takes 2.
+  ***************************************************************/
+ 
+ void dct_vector(short *input, short *output) {
+ 
+   vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+   vector signed short a0, a1, a2, ma2, c4, mc4, zero;
+   vector signed short vx[8], vy[8];
+   vector signed short *vec_ptr;  /* used for conversion between
+                                     arrays of short and vector
+                                     signed short array.  */
+ 
+   /* load the multiplication constants */
+   c4   = (vector signed short)(23170);   /* c4 = cos(4*pi/16)  */
+   a0   = (vector signed short)(13573);   /* a0 = c6/c2         */
+   a1   = (vector signed short)(6518);    /* a1 = c7/c1         */
+   a2   = (vector signed short)(21895);   /* a2 = c5/c3         */
+   mc4   = (vector signed short)(-23170); /* -c4                */
+   ma2   = (vector signed short)(-21895); /* -a2                */
+   zero = (vector signed short)(0);       /* 0                  */
+ 
+   /* copy the rows of input data */
+   vec_ptr = ( vector signed short * ) input;
+   vx[0] = vec_ptr[0];
+   vx[1] = vec_ptr[1];
+   vx[2] = vec_ptr[2];
+   vx[3] = vec_ptr[3];
+   vx[4] = vec_ptr[4];
+   vx[5] = vec_ptr[5];
+   vx[6] = vec_ptr[6];
+   vx[7] = vec_ptr[7];
+ 
+   /* Perform DCT first on the 8 columns */
+   DCT_Transform( vx, vy );
+ 
+   /* Transpose matrix to work on rows */
+   Matrix_Transpose( vy, vx );
+ 
+   /* Then perform DCT on the 8 rows */
+   DCT_Transform( vx, vy );
+ 
+   /* Post-scale and store result. */
+   vec_ptr = (vector signed short *) output;
+ 
+   vec_ptr[0] = vec_mradds( PostScale[0], vy[0], zero );
+   vec_ptr[1] = vec_mradds( PostScale[1], vy[1], zero );
+   vec_ptr[2] = vec_mradds( PostScale[2], vy[2], zero );
+   vec_ptr[3] = vec_mradds( PostScale[3], vy[3], zero );
+   vec_ptr[4] = vec_mradds( PostScale[4], vy[4], zero );
+   vec_ptr[5] = vec_mradds( PostScale[5], vy[5], zero );
+   vec_ptr[6] = vec_mradds( PostScale[6], vy[6], zero );
+   vec_ptr[7] = vec_mradds( PostScale[7], vy[7], zero );
+ 
+ }

Index: llvm/examples/SIMD/DCT/dct.sse.handwritten.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.sse.handwritten.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/dct.sse.handwritten.c	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,129 ----
+ #include "Scalar.h"
+ #include "SSE.h"
+ 
+ extern short *PostScalePtr;
+ 
+ static inline void Matrix_Transpose ( short *input_scalar, short *output_scalar)
+ {
+   __m128i *input = (__m128i*) input_scalar;
+   __m128i *output = (__m128i*) output_scalar;
+ 
+   __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+   __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+ 
+   b0 = _mm_unpacklo_epi16( input[0], input[4] );     /* [ 00 40 01 41 02 42 03 43 ]*/
+   b1 = _mm_unpackhi_epi16( input[0], input[4] );     /* [ 04 44 05 45 06 46 07 47 ]*/
+   b2 = _mm_unpacklo_epi16( input[1], input[5] );     /* [ 10 50 11 51 12 52 13 53 ]*/
+   b3 = _mm_unpackhi_epi16( input[1], input[5] );     /* [ 14 54 15 55 16 56 17 57 ]*/
+   b4 = _mm_unpacklo_epi16( input[2], input[6] );     /* [ 20 60 21 61 22 62 23 63 ]*/
+   b5 = _mm_unpackhi_epi16( input[2], input[6] );     /* [ 24 64 25 65 26 66 27 67 ]*/
+   b6 = _mm_unpacklo_epi16( input[3], input[7] );     /* [ 30 70 31 71 32 72 33 73 ]*/
+   b7 = _mm_unpackhi_epi16( input[3], input[7] );     /* [ 34 74 35 75 36 76 37 77 ]*/
+ 
+   a0 = _mm_unpacklo_epi16( b0, b4 );                 /* [ 00 20 40 60 01 21 41 61 ]*/
+   a1 = _mm_unpackhi_epi16( b0, b4 );                 /* [ 02 22 42 62 03 23 43 63 ]*/
+   a2 = _mm_unpacklo_epi16( b1, b5 );                 /* [ 04 24 44 64 05 25 45 65 ]*/
+   a3 = _mm_unpackhi_epi16( b1, b5 );                 /* [ 06 26 46 66 07 27 47 67 ]*/
+   a4 = _mm_unpacklo_epi16( b2, b6 );                 /* [ 10 30 50 70 11 31 51 71 ]*/
+   a5 = _mm_unpackhi_epi16( b2, b6 );                 /* [ 12 32 52 72 13 33 53 73 ]*/
+   a6 = _mm_unpacklo_epi16( b3, b7 );                 /* [ 14 34 54 74 15 35 55 75 ]*/
+   a7 = _mm_unpackhi_epi16( b3, b7 );                 /* [ 16 36 56 76 17 37 57 77 ]*/
+ 
+   output[0] = _mm_unpacklo_epi16( a0, a4 );          /* [ 00 10 20 30 40 50 60 70 ]*/
+   output[1] = _mm_unpackhi_epi16( a0, a4 );          /* [ 01 11 21 31 41 51 61 71 ]*/
+   output[2] = _mm_unpacklo_epi16( a1, a5 );          /* [ 02 12 22 32 42 52 62 72 ]*/
+   output[3] = _mm_unpackhi_epi16( a1, a5 );          /* [ 03 13 23 33 43 53 63 73 ]*/
+   output[4] = _mm_unpacklo_epi16( a2, a6 );          /* [ 04 14 24 34 44 54 64 74 ]*/
+   output[5] = _mm_unpackhi_epi16( a2, a6 );          /* [ 05 15 25 35 45 55 65 75 ]*/
+   output[6] = _mm_unpacklo_epi16( a3, a7 );          /* [ 06 16 26 36 46 56 66 76 ]*/
+   output[7] = _mm_unpackhi_epi16( a3, a7 );          /* [ 07 17 27 37 47 57 67 77 ]*/
+ 
+ }
+ 
+ static inline void DCT_Transform ( short *x, short *y) {
+   __m128i *vx = (__m128i*) x;
+   __m128i *vy = (__m128i*) y;
+ 
+   __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ 
+   __m128i c13573 = _mm_splat_epi16(13573);
+   __m128i c21895 = _mm_splat_epi16(21895);
+   __m128i cNeg21895 = _mm_splat_epi16(-21895);
+   __m128i c23170 = _mm_splat_epi16(23170);
+   __m128i cNeg23170 = _mm_splat_epi16(-23170);
+   __m128i c6518 = _mm_splat_epi16(6518);
+ 
+   t8 = _mm_adds_epi16(vx[0], vx[7]);
+   t9 = _mm_subs_epi16(vx[0], vx[7]);
+   t0 = _mm_adds_epi16(vx[1], vx[6]);
+   t7 = _mm_subs_epi16(vx[1], vx[6]);
+   t1 = _mm_adds_epi16(vx[2], vx[5]);
+   t6 = _mm_subs_epi16(vx[2], vx[5]);
+   t2 = _mm_adds_epi16(vx[3], vx[4]);
+   t5 = _mm_subs_epi16(vx[3], vx[4]);
+ 
+   t3 = _mm_adds_epi16(t8, t2);
+   t4 = _mm_subs_epi16(t8, t2);
+   t2 = _mm_adds_epi16(t0, t1);
+   t8 = _mm_subs_epi16(t0, t1);
+ 
+   t1 = _mm_adds_epi16(t7, t6);
+   t0 = _mm_subs_epi16(t7, t6);
+ 
+   vy[0] = _mm_adds_epi16(t3, t2);
+   vy[4] = _mm_subs_epi16(t3, t2);
+ 
+   vy[2] = _mm_mradds_epi16(t8, c13573, t4);
+   t10 = _mm_mr_epi16(t4, c13573);
+ 
+   vy[6] = _mm_subs_epi16(t10, t8);
+ 
+   t6 = _mm_mradds_epi16(t0, c23170, t5);
+   t7 = _mm_mradds_epi16(t0, cNeg23170, t5);
+   t2 = _mm_mradds_epi16(t1, cNeg23170, t9);
+   t3 = _mm_mradds_epi16(t1, c23170, t9);
+ 
+   vy[1] = _mm_mradds_epi16(t6, c6518, t3);
+   t9 = _mm_mr_epi16(t3, c6518);
+ 
+   vy[7] = _mm_subs_epi16(t9, t6);
+   vy[5] = _mm_mradds_epi16(t2, c21895, t7);
+   vy[3] = _mm_mradds_epi16(t7, cNeg21895, t2);
+ 
+ }
+ 
+ #define STORE(i) \
+      outputv[i] = _mm_mradds_epi16(PostScalev[i], yv[i], _mm_splat_epi16(0));
+ 
+ void dct_vector(short *input, short *output, short *x, short *y) {
+ 
+   __m128i *xv = (__m128i*) x;
+   __m128i *yv = (__m128i*) y;
+   __m128i *inputv = (__m128i*) input;
+   __m128i *outputv = (__m128i*) output;
+   __m128i *PostScalev = (__m128i*) PostScalePtr;
+ 
+   xv[0] = inputv[0];
+   xv[1] = inputv[1];
+   xv[2] = inputv[2];
+   xv[3] = inputv[3];
+   xv[4] = inputv[4];
+   xv[5] = inputv[5];
+   xv[6] = inputv[6];
+   xv[7] = inputv[7];
+ 
+   DCT_Transform( x, y );
+   Matrix_Transpose( y, x );
+   DCT_Transform( x, y );
+ 
+   STORE(0);
+   STORE(1);
+   STORE(2);
+   STORE(3);
+   STORE(4);
+   STORE(5);
+   STORE(6);
+   STORE(7);
+ 
+ }
+ 

Index: llvm/examples/SIMD/DCT/dct.vectorc.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.vectorc.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/dct.vectorc.c	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,140 ----
+ #include "Scalar.h"
+ #include "VectorC.h"
+ #include "Intrinsics.h"
+ 
+ // See the rgb2yuv benchmark for a description of USE_C0.  For some
+ // reason, USE_C0 1 seems to be slightly *faster* on SSE!  I'm
+ // investigating why this is.
+ //
+ #define USE_C0 1
+ 
+ short vllvm_adds_short(short,short);
+ 
+ #define MERGE(out01, out0, out1, in0, in1) \
+   short out01 = vllvm_fixed_vimm_short(0, 16); \
+   out01 = _fixed_combine_short(out01, 16, in0, 8, 0, 1); \
+   out01 = _fixed_combine_short(out01, 16, in1, 8, 8, 1); \
+   short out0 = _extract_short(out01, 0, 2, 8); \
+   short out1 = _extract_short(out01, 1, 2, 8)
+ 
+ #define IN(x) \
+   vllvm_load_short(input_scalar, 8, x)
+ 
+ #define STORE(out, idx) \
+   vllvm_store_short(out, output_scalar, idx)
+ 
+ static inline void Matrix_Transpose_VectorC (short *input_scalar, short *output_scalar) {
+   MERGE(b01, b0, b1, IN(0), IN(4));
+   MERGE(b23, b2, b3, IN(1), IN(5));
+   MERGE(b45, b4, b5, IN(2), IN(6));
+   MERGE(b67, b6, b7, IN(3), IN(7));
+ 
+   MERGE(a01, a0, a1, b0, b4);
+   MERGE(a23, a2, a3, b1, b5);
+   MERGE(a45, a4, a5, b2, b6);
+   MERGE(a67, a6, a7, b3, b7);
+ 
+   MERGE(out01, out0, out1, a0, a4);
+   MERGE(out23, out2, out3, a1, a5);
+   MERGE(out45, out4, out5, a2, a6);
+   MERGE(out67, out6, out7, a3, a7);
+ 
+   STORE(out0, 0);
+   STORE(out1, 1);
+   STORE(out2, 2);
+   STORE(out3, 3);
+   STORE(out4, 4);
+   STORE(out5, 5);
+   STORE(out6, 6);
+   STORE(out7, 7);
+ }
+ 
+ static inline void DCT_Transform_VectorC ( short *x, short *y) {
+   signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ 
+   t8 = vllvm_adds_short(vllvm_load_short(x, 8, 0), vllvm_load_short(x, 8, 7));
+   t9 = vllvm_subs_short(vllvm_load_short(x, 8, 0), vllvm_load_short(x, 8, 7));
+   t0 = vllvm_adds_short(vllvm_load_short(x, 8, 1), vllvm_load_short(x, 8, 6));
+   t7 = vllvm_subs_short(vllvm_load_short(x, 8, 1), vllvm_load_short(x, 8, 6));
+   t1 = vllvm_adds_short(vllvm_load_short(x, 8, 2), vllvm_load_short(x, 8, 5));
+   t6 = vllvm_subs_short(vllvm_load_short(x, 8, 2), vllvm_load_short(x, 8, 5));
+   t2 = vllvm_adds_short(vllvm_load_short(x, 8, 3), vllvm_load_short(x, 8, 4));
+   t5 = vllvm_subs_short(vllvm_load_short(x, 8, 3), vllvm_load_short(x, 8, 4));
+ 
+   t3 = vllvm_adds_short(t8, t2);
+   t4 = vllvm_subs_short(t8, t2);
+   t2 = vllvm_adds_short(t0, t1);
+   t8 = vllvm_subs_short(t0, t1);
+ 
+   t1 = vllvm_adds_short(t7, t6);
+   t0 = vllvm_subs_short(t7, t6);
+ 
+   vllvm_store_short(vllvm_adds_short(t3, t2), y, 0);
+   vllvm_store_short(vllvm_subs_short(t3, t2), y, 4);
+ 
+   short c13573 = vllvm_fixed_vimm_short(13573, 8);
+ #if USE_C0
+   short c0 = vllvm_fixed_vimm_short(0, 8);
+ #endif
+   short c23170 = vllvm_fixed_vimm_short(23170, 8);
+   short cneg23170 = vllvm_fixed_vimm_short(-23170, 8);
+   short c6518 = vllvm_fixed_vimm_short(6518, 8);
+ 
+   vllvm_store_short(vllvm_mradds_short(t8, c13573, t4), y, 2);
+ #if USE_C0
+   t10 = vllvm_mradds_short(t4, c13573, c0);
+ #else
+   t10 = vllvm_mr_short(t4, c13573);
+ #endif
+   vllvm_store_short(vllvm_subs_short(t10, t8), y, 6);
+ 
+   t6 = vllvm_mradds_short(t0, c23170, t5);
+   t7 = vllvm_mradds_short(t0, cneg23170, t5);
+   t2 = vllvm_mradds_short(t1, cneg23170, t9);
+   t3 = vllvm_mradds_short(t1, c23170, t9);
+ 
+   vllvm_store_short(vllvm_mradds_short(t6, c6518, t3), y, 1);
+ #if USE_C0
+   t9 = vllvm_mradds_short(t3, c6518, c0);
+ #else
+   t9 = vllvm_mr_short(t3, c6518);
+ #endif
+   vllvm_store_short(vllvm_subs_short(t9, t6), y, 7);
+   vllvm_store_short(vllvm_mradds_short(t2, vllvm_fixed_vimm_short(21895, 8), t7), y, 5);
+   vllvm_store_short(vllvm_mradds_short(t7, vllvm_fixed_vimm_short(-21895, 8), t2), y, 3);
+ 
+ }
+ 
+ extern short *PostScalePtr;
+ 
+ #define STORE2(i) \
+     vllvm_store_short(vllvm_mradds_short(vllvm_load_short(PostScalePtr, 8, i), \
+ 					   vllvm_load_short(y, 8, i), \
+ 					   vllvm_fixed_vimm_short(0, 8)), \
+ 			output, i);
+ 
+ void dct_vector(short *input, short *output, short *x, short *y) {
+ 
+   vllvm_store_short(vllvm_load_short(input, 8, 0), x, 0);
+   vllvm_store_short(vllvm_load_short(input, 8, 1), x, 1);
+   vllvm_store_short(vllvm_load_short(input, 8, 2), x, 2);
+   vllvm_store_short(vllvm_load_short(input, 8, 3), x, 3);
+   vllvm_store_short(vllvm_load_short(input, 8, 4), x, 4);
+   vllvm_store_short(vllvm_load_short(input, 8, 5), x, 5);
+   vllvm_store_short(vllvm_load_short(input, 8, 6), x, 6);
+   vllvm_store_short(vllvm_load_short(input, 8, 7), x, 7);
+   DCT_Transform_VectorC( x, y );
+   Matrix_Transpose_VectorC( y, x );
+   DCT_Transform_VectorC( x, y );
+ 
+   STORE2(0);
+   STORE2(1);
+   STORE2(2);
+   STORE2(3);
+   STORE2(4);
+   STORE2(5);
+   STORE2(6);
+   STORE2(7);
+ 
+ }
+ 

Index: llvm/examples/SIMD/DCT/main.c
diff -c /dev/null llvm/examples/SIMD/DCT/main.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/main.c	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,175 ----
+ #define N 1024
+ 
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <sys/time.h>
+ #include <sys/times.h>
+ #include "../_malloc.h"
+ #include "Scalar.h"
+ 
+ inline void dct_scalar(short*, short*);
+ void dct_vector(short*, short*, short*, short*);
+ 
+ short *in;
+ short *out_vector;
+ short *out_scalar;
+ 
+ static short PostScaleArray[64] = {
+     4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+     5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880,
+     5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+     4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+     4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+     4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+     5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+     5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880
+ };
+ 
+ short *PostScalePtr;
+ 
+ void init() {
+   int i;
+ 
+   // Force 16-byte alignment
+   //
+   in = (short*) _malloc(N*sizeof(short));
+   out_vector = (short*) _malloc(N*sizeof(short));
+   out_scalar = (short*) _malloc(N*sizeof(short));
+   PostScalePtr = (short*) _malloc(64*sizeof(short));
+   memcpy(PostScalePtr, PostScaleArray, 64*sizeof(short));
+   
+   // Populate in with a range of values
+   //
+   for (i = 0; i < N; ++i) {
+     in[i] = N/2-i;
+   }
+   
+ }
+ 
+ float run() {
+   /* Time dct_scalar against dct_vector, compare outputs, return the speedup. */
+   int i,j;
+   struct tms buf_s, buf_e;
+   long scalar_time = 0, vector_time = 0;
+ 
+   times(&buf_s);
+   for (i = 0; i < 100000; ++i)
+     for (j = 0; j < N; j +=64)
+       dct_scalar(in+j, out_scalar+j);
+   times(&buf_e);
+   scalar_time = buf_e.tms_utime - buf_s.tms_utime;
+   printf("scalar time=%ld, ", scalar_time);
+   /* Scratch blocks for dct_vector. NOTE(review): allocated per call, never released. */
+   short *x = (short*) _malloc(64*sizeof(short));
+   short *y = (short*) _malloc(64*sizeof(short));
+   times(&buf_s);
+   for (i = 0; i < 100000; ++i)
+     for (j = 0; j < N; j +=64)
+       dct_vector(in+j, out_vector+j, x, y);
+   times(&buf_e);
+   vector_time = buf_e.tms_utime - buf_s.tms_utime;
+   printf("vector time=%ld, ", vector_time);
+   
+   float speedup = ((float) scalar_time)/vector_time;
+   printf("speedup=%f\n", speedup);
+   /* Any element that differs between the two outputs aborts the program. */
+   for (i = 0; i < N; i++) {
+     if (out_vector[i] != out_scalar[i]) {
+       printf("FAILED\n");
+       exit(1);
+     }
+   }
+   
+   return speedup;
+ }
+ 
+ int main (void) {
+   unsigned i;
+ 
+   init();
+ 
+   float best = 0;
+   for (i = 0; i < NRUNS; ++i) {
+     float speedup = run();
+     if (speedup > best)
+       best = speedup;
+   }
+   printf("best speedup=%f\n", best);
+ 
+   printf ("PASSED\n");
+   return 0;
+ 
+ }
+ 
+ static inline void Matrix_Transpose ( short *input, short *output) {
+   unsigned i;
+   for (i = 0; i < 8; ++i) {
+     output[i] = input[8*i];
+     output[8+i] = input[8*i+1];
+     output[16+i] = input[8*i+2];
+     output[24+i] = input[8*i+3];
+     output[32+i] = input[8*i+4];
+     output[40+i] = input[8*i+5];
+     output[48+i] = input[8*i+6];
+     output[56+i] = input[8*i+7];
+   }
+ }
+ 
+ static inline void DCT_Transform ( short *x, short *y) {
+   signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ 
+   unsigned i;
+   for (i = 0; i < 8; ++i) {
+     t8 = adds_short(x[i], x[56+i]);
+     t9 = subs_short(x[i], x[56+i]);
+     t0 = adds_short(x[8+i], x[48+i]);
+     t7 = subs_short(x[8+i], x[48+i]);
+     t1 = adds_short(x[16+i], x[40+i]);
+     t6 = subs_short(x[16+i], x[40+i]);
+     t2 = adds_short(x[24+i], x[32+i]);
+     t5 = subs_short(x[24+i], x[32+i]);
+ 
+     t3 = adds_short(t8, t2);
+     t4 = subs_short(t8, t2);
+     t2 = adds_short(t0, t1);
+     t8 = subs_short(t0, t1);
+     
+     t1 = adds_short(t7, t6);
+     t0 = subs_short(t7, t6);
+ 
+     y[i] = adds_short(t3, t2);
+     y[32+i] = subs_short(t3, t2);
+     y[16+i] = mradds_short(t8, 13573, t4);
+     t10 = mradds_short(t4, 13573, 0);
+     y[48+i] = subs_short(t10, t8);
+ 
+     t6 = mradds_short(t0, 23170, t5);
+     t7 = mradds_short(t0, -23170, t5);
+     t2 = mradds_short(t1, -23170, t9);
+     t3 = mradds_short(t1, 23170, t9);
+ 
+     y[8+i] = mradds_short(t6, 6518, t3);
+     t9 = mradds_short(t3, 6518, 0);
+     y[56+i] = subs_short(t9, t6);
+     y[40+i] = mradds_short(t2, 21895, t7);
+     y[24+i] = mradds_short(t7, -21895, t2);
+   }
+ 
+ }
+ 
+ void dct_scalar(short *input, short *output) {
+ 
+   short x[64], y[64];
+   unsigned i, j;
+ 
+   memcpy(x, input, 64*sizeof(short));
+   DCT_Transform( x, y );
+   Matrix_Transpose( y, x );
+   DCT_Transform( x, y );
+ 
+   for (i = 0; i < 8; ++i)
+     for (j = 0; j < 8; ++j)
+       output[8*i+j] = mradds_short(PostScaleArray[8*i+j], y[8*i+j], 0);
+ }
+