Robert L. Bocchino Jr.
bocchino at persephone.cs.uiuc.edu
Sun Oct 23 15:50:20 PDT 2005
Changes in directory llvm/examples/SIMD/DCT:
Makefile added (r1.1.2.1)
dct.altivec.handwritten.c added (r1.1.2.1)
dct.sse.handwritten.c added (r1.1.2.1)
dct.vectorc.c added (r1.1.2.1)
main.c added (r1.1.2.1)
Log message:
Examples to illustrate Vector LLVM's SIMD support.
Diffs of the changes: (+646 -0)
Makefile | 4
dct.altivec.handwritten.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++
dct.sse.handwritten.c | 129 +++++++++++++++++++++++++++++
dct.vectorc.c | 140 ++++++++++++++++++++++++++++++++
main.c | 175 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 646 insertions
Index: llvm/examples/SIMD/DCT/Makefile
diff -c /dev/null llvm/examples/SIMD/DCT/Makefile:
*** /dev/null Sun Oct 23 17:49:50 2005
--- llvm/examples/SIMD/DCT/Makefile Sun Oct 23 17:49:39 2005
*** 0 ****
--- 1,4 ----
+ NAME= dct
+ include ../Makefile.common
Index: llvm/examples/SIMD/DCT/dct.altivec.handwritten.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.altivec.handwritten.c:
*** /dev/null Sun Oct 23 17:50:17 2005
--- llvm/examples/SIMD/DCT/dct.altivec.handwritten.c Sun Oct 23 17:49:39 2005
*** 0 ****
--- 1,198 ----
+ /* Transposes an 8x8 matrix of 16-bit values held as eight row vectors.
+  * Three rounds of vec_mergeh/vec_mergel are applied; the trailing comments
+  * list each vector's lanes as <row><column> of the original matrix. */
+ static inline void Matrix_Transpose ( vector signed short *input, vector signed short *output)
+ {
+ /* Read all eight rows before anything is written to output. */
+ vector signed short r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3];
+ vector signed short r4 = input[4], r5 = input[5], r6 = input[6], r7 = input[7];
+ vector signed short p0, p1, p2, p3, p4, p5, p6, p7; /* round 1 results */
+ vector signed short q0, q1, q2, q3, q4, q5, q6, q7; /* round 2 results */
+ /* Round 1: pair row k with row k+4. */
+ p0 = vec_mergeh( r0, r4 ); /* [ 00 40 01 41 02 42 03 43 ]*/
+ p1 = vec_mergel( r0, r4 ); /* [ 04 44 05 45 06 46 07 47 ]*/
+ p2 = vec_mergeh( r1, r5 ); /* [ 10 50 11 51 12 52 13 53 ]*/
+ p3 = vec_mergel( r1, r5 ); /* [ 14 54 15 55 16 56 17 57 ]*/
+ p4 = vec_mergeh( r2, r6 ); /* [ 20 60 21 61 22 62 23 63 ]*/
+ p5 = vec_mergel( r2, r6 ); /* [ 24 64 25 65 26 66 27 67 ]*/
+ p6 = vec_mergeh( r3, r7 ); /* [ 30 70 31 71 32 72 33 73 ]*/
+ p7 = vec_mergel( r3, r7 ); /* [ 34 74 35 75 36 76 37 77 ]*/
+ /* Round 2: pair round-1 result k with result k+4. */
+ q0 = vec_mergeh( p0, p4 ); /* [ 00 20 40 60 01 21 41 61 ]*/
+ q1 = vec_mergel( p0, p4 ); /* [ 02 22 42 62 03 23 43 63 ]*/
+ q2 = vec_mergeh( p1, p5 ); /* [ 04 24 44 64 05 25 45 65 ]*/
+ q3 = vec_mergel( p1, p5 ); /* [ 06 26 46 66 07 27 47 67 ]*/
+ q4 = vec_mergeh( p2, p6 ); /* [ 10 30 50 70 11 31 51 71 ]*/
+ q5 = vec_mergel( p2, p6 ); /* [ 12 32 52 72 13 33 53 73 ]*/
+ q6 = vec_mergeh( p3, p7 ); /* [ 14 34 54 74 15 35 55 75 ]*/
+ q7 = vec_mergel( p3, p7 ); /* [ 16 36 56 76 17 37 57 77 ]*/
+ /* Round 3: the same pairing once more yields the transposed rows. */
+ output[0] = vec_mergeh( q0, q4 ); /* [ 00 10 20 30 40 50 60 70 ]*/
+ output[1] = vec_mergel( q0, q4 ); /* [ 01 11 21 31 41 51 61 71 ]*/
+ output[2] = vec_mergeh( q1, q5 ); /* [ 02 12 22 32 42 52 62 72 ]*/
+ output[3] = vec_mergel( q1, q5 ); /* [ 03 13 23 33 43 53 63 73 ]*/
+ output[4] = vec_mergeh( q2, q6 ); /* [ 04 14 24 34 44 54 64 74 ]*/
+ output[5] = vec_mergel( q2, q6 ); /* [ 05 15 25 35 45 55 65 75 ]*/
+ output[6] = vec_mergeh( q3, q7 ); /* [ 06 16 26 36 46 56 66 76 ]*/
+ output[7] = vec_mergel( q3, q7 ); /* [ 07 17 27 37 47 57 67 77 ]*/
+ }
+ /***************************************************************
+ *
+ * Copyright: (c) Copyright Motorola Inc. 1998
+ *
+ * Date: April 15, 1998
+ *
+ * Macro: DCT_Transform
+ *
+ * Description: Discrete Cosine Transform implemented by the
+ * Scaled Chen (II) Algorithm developed by Haifa
+ * Research Lab. The major difference between this
+ * algorithm and the Scaled Chen (I) is that
+ * certain multiply-subtracts are replaced by
+ * multiply adds. A full description of the
+ * Scaled Chen (I) algorithm can be found in:
+ * W.C.Chen, C.H.Smith and S.C.Fralick, "A Fast
+ * Computational Algorithm for the Discrete Cosine
+ * Transform", IEEE Transactions on Communications,
+ * Vol. COM-25, No. 9, pp 1004-1009, Sept. 1977.
+ *
+ * Inputs: vx : array of vector short
+ * t1-t10 : temporary vector variables set up by caller
+ * c4 : cos(4*pi/16)
+ * mc4 : -c4
+ * a0 : c6/c2
+ * a1 : c7/c1
+ * a2 : c5/c3
+ * ma2 : -a2
+ * zero : an array of zero elements
+ *
+ * Outputs: vy : array of vector short
+ *
+ **************************************************************/
+ /* NOTE: this macro expands in the caller's scope and uses the caller's
+  * t0-t10, a0, a1, a2, ma2, c4, mc4 and zero variables by name. In the
+  * trailing comments, tN on the right-hand side of the 1st and 2nd stages
+  * denotes input row vx[N]; from the 3rd stage on it denotes the temporary. */
+ #define DCT_Transform(vx,vy) \
+ \
+ /* 1st stage. */ \
+ t8 = vec_adds( vx[0], vx[7] ); /* t0 + t7 */ \
+ t9 = vec_subs( vx[0], vx[7] ); /* t0 - t7 */ \
+ t0 = vec_adds( vx[1], vx[6] ); /* t1 + t6 */ \
+ t7 = vec_subs( vx[1], vx[6] ); /* t1 - t6 */ \
+ t1 = vec_adds( vx[2], vx[5] ); /* t2 + t5 */ \
+ t6 = vec_subs( vx[2], vx[5] ); /* t2 - t5 */ \
+ t2 = vec_adds( vx[3], vx[4] ); /* t3 + t4 */ \
+ t5 = vec_subs( vx[3], vx[4] ); /* t3 - t4 */ \
+ \
+ /* 2nd stage. */ \
+ t3 = vec_adds( t8, t2 ); /* (t0+t7) + (t3+t4) */ \
+ t4 = vec_subs( t8, t2 ); /* (t0+t7) - (t3+t4) */ \
+ t2 = vec_adds( t0, t1 ); /* (t1+t6) + (t2+t5) */ \
+ t8 = vec_subs( t0, t1 ); /* (t1+t6) - (t2+t5) */ \
+ \
+ t1 = vec_adds( t7, t6 ); /* (t1-t6) + (t2-t5) */ \
+ t0 = vec_subs( t7, t6 ); /* (t1-t6) - (t2-t5) */ \
+ \
+ /* 3rd stage */ \
+ vy[0] = vec_adds( t3, t2 ); /* y0 = t3 + t2 */ \
+ vy[4] = vec_subs( t3, t2 ); /* y4 = t3 - t2 */ \
+ vy[2] = vec_mradds( t8, a0, t4 ); /* y2 = t8 * (a0) + t4 */ \
+ t10 = vec_mradds( t4, a0, zero ); \
+ vy[6] = vec_subs( t10, t8 ); /* y6 = t4 * (a0) - t8 */ \
+ \
+ t6 = vec_mradds( t0, c4, t5 ); /* t6 = t0 * (c4) + t5 */ \
+ t7 = vec_mradds( t0, mc4, t5 ); /* t7 = t0 * (-c4) + t5 */ \
+ t2 = vec_mradds( t1, mc4, t9 ); /* t2 = t1 * (-c4) + t9 */ \
+ t3 = vec_mradds( t1, c4, t9 ); /* t3 = t1 * (c4) + t9 */ \
+ \
+ /* 4th stage. */ \
+ vy[1] = vec_mradds( t6, a1, t3 ); /* y1 = t6 * (a1) + t3 */ \
+ t9 = vec_mradds( t3, a1, zero ); \
+ vy[7] = vec_subs( t9, t6 ) ; /* y7 = t3 * (a1) - t6 */ \
+ vy[5] = vec_mradds( t2, a2, t7 ); /* y5 = t2 * (a2) + t7 */ \
+ vy[3] = vec_mradds( t7, ma2, t2 ); /* y3 = t7 * (-a2) + t2 */
+ /* Post-scaling matrix -- scaled by 1.
+  * Holds one vector of eight factors per row of the 8x8 result; dct_vector
+  * combines PostScale[i] with row i of the transformed data using vec_mradds
+  * before storing it. The table is symmetric (entry [r][c] equals [c][r]). */
+ vector signed short PostScale[8] = {
+ (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
+ (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 ),
+ (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
+ (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
+ (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
+ (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
+ (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
+ (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 )
+ };
+ /***************************************************************
+ *
+ * Copyright: (c) Copyright Motorola Inc. 1998
+ *
+ * Date: April 17, 1998
+ *
+ * Function: DCT
+ *
+ * Description: Scaled Chen (II) algorithm for DCT
+ * Arithmetic is 16-bit fixed point.
+ *
+ * Inputs: input - Pointer to input data (short), which
+ * must be between -255 to +255.
+ * It is assumed that the allocated array
+ * has been 128-bit aligned and contains
+ * 8x8 short elements.
+ *
+ * Outputs: output - Pointer to output area for the transformed
+ * data. The output values are between -2040
+ * and 2040. It is assumed that a 128-bit
+ * aligned 8x8 array of short has been
+ * pre-allocated.
+ *
+ * Return: None
+ *
+ ***************************************************************/
+ /* 8x8 DCT of one block: column pass, transpose, second pass, post-scale.
+  * input and output are each accessed as eight vector signed short rows. */
+ void dct_vector(short *input, short *output) {
+ /* Scratch variables and constants below are referenced by name from the
+    DCT_Transform macro, so their names must not change. */
+ vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ vector signed short a0, a1, a2, ma2, c4, mc4, zero;
+ vector signed short vx[8], vy[8];
+ /* Vector views of the caller's arrays of short. */
+ vector signed short *in_rows = ( vector signed short * ) input;
+ vector signed short *out_rows = ( vector signed short * ) output;
+ int row;
+ /* Multiplication constants. */
+ zero = (vector signed short)(0); /* 0 */
+ c4 = (vector signed short)(23170); /* c4 = cos(4*pi/16) */
+ mc4 = (vector signed short)(-23170); /* -c4 */
+ a0 = (vector signed short)(13573); /* a0 = c6/c2 */
+ a1 = (vector signed short)(6518); /* a1 = c7/c1 */
+ a2 = (vector signed short)(21895); /* a2 = c5/c3 */
+ ma2 = (vector signed short)(-21895); /* -a2 */
+ /* Copy the eight rows of input data. */
+ for (row = 0; row < 8; ++row)
+ vx[row] = in_rows[row];
+ /* Perform DCT first on the 8 columns */
+ DCT_Transform( vx, vy );
+ /* Transpose matrix to work on rows */
+ Matrix_Transpose( vy, vx );
+ /* Perform DCT next on the 8 rows */
+ DCT_Transform( vx, vy );
+ /* Post-scale and store result. */
+ for (row = 0; row < 8; ++row)
+ out_rows[row] = vec_mradds( PostScale[row], vy[row], zero );
+ }
Index: llvm/examples/SIMD/DCT/dct.sse.handwritten.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.sse.handwritten.c:
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/dct.sse.handwritten.c Sun Oct 23 17:49:39 2005
*** 0 ****
--- 1,129 ----
+ #include "Scalar.h"
+ #include "SSE.h"
+ extern short *PostScalePtr;
+ /* Transposes an 8x8 matrix of 16-bit values. Both arguments are accessed as
+  * eight __m128i rows. Three rounds of _mm_unpacklo_epi16/_mm_unpackhi_epi16
+  * are applied; the trailing comments list each vector's lanes as
+  * <row><column> of the original matrix. */
+ static inline void Matrix_Transpose ( short *input_scalar, short *output_scalar)
+ {
+ __m128i *src = (__m128i*) input_scalar;
+ __m128i *dst = (__m128i*) output_scalar;
+ /* Read all eight rows before anything is written to dst. */
+ __m128i r0 = src[0], r1 = src[1], r2 = src[2], r3 = src[3];
+ __m128i r4 = src[4], r5 = src[5], r6 = src[6], r7 = src[7];
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7; /* round 1 results */
+ __m128i q0, q1, q2, q3, q4, q5, q6, q7; /* round 2 results */
+ /* Round 1: pair row k with row k+4. */
+ p0 = _mm_unpacklo_epi16( r0, r4 ); /* [ 00 40 01 41 02 42 03 43 ]*/
+ p1 = _mm_unpackhi_epi16( r0, r4 ); /* [ 04 44 05 45 06 46 07 47 ]*/
+ p2 = _mm_unpacklo_epi16( r1, r5 ); /* [ 10 50 11 51 12 52 13 53 ]*/
+ p3 = _mm_unpackhi_epi16( r1, r5 ); /* [ 14 54 15 55 16 56 17 57 ]*/
+ p4 = _mm_unpacklo_epi16( r2, r6 ); /* [ 20 60 21 61 22 62 23 63 ]*/
+ p5 = _mm_unpackhi_epi16( r2, r6 ); /* [ 24 64 25 65 26 66 27 67 ]*/
+ p6 = _mm_unpacklo_epi16( r3, r7 ); /* [ 30 70 31 71 32 72 33 73 ]*/
+ p7 = _mm_unpackhi_epi16( r3, r7 ); /* [ 34 74 35 75 36 76 37 77 ]*/
+ /* Round 2: pair round-1 result k with result k+4. */
+ q0 = _mm_unpacklo_epi16( p0, p4 ); /* [ 00 20 40 60 01 21 41 61 ]*/
+ q1 = _mm_unpackhi_epi16( p0, p4 ); /* [ 02 22 42 62 03 23 43 63 ]*/
+ q2 = _mm_unpacklo_epi16( p1, p5 ); /* [ 04 24 44 64 05 25 45 65 ]*/
+ q3 = _mm_unpackhi_epi16( p1, p5 ); /* [ 06 26 46 66 07 27 47 67 ]*/
+ q4 = _mm_unpacklo_epi16( p2, p6 ); /* [ 10 30 50 70 11 31 51 71 ]*/
+ q5 = _mm_unpackhi_epi16( p2, p6 ); /* [ 12 32 52 72 13 33 53 73 ]*/
+ q6 = _mm_unpacklo_epi16( p3, p7 ); /* [ 14 34 54 74 15 35 55 75 ]*/
+ q7 = _mm_unpackhi_epi16( p3, p7 ); /* [ 16 36 56 76 17 37 57 77 ]*/
+ /* Round 3: the same pairing once more yields the transposed rows. */
+ dst[0] = _mm_unpacklo_epi16( q0, q4 ); /* [ 00 10 20 30 40 50 60 70 ]*/
+ dst[1] = _mm_unpackhi_epi16( q0, q4 ); /* [ 01 11 21 31 41 51 61 71 ]*/
+ dst[2] = _mm_unpacklo_epi16( q1, q5 ); /* [ 02 12 22 32 42 52 62 72 ]*/
+ dst[3] = _mm_unpackhi_epi16( q1, q5 ); /* [ 03 13 23 33 43 53 63 73 ]*/
+ dst[4] = _mm_unpacklo_epi16( q2, q6 ); /* [ 04 14 24 34 44 54 64 74 ]*/
+ dst[5] = _mm_unpackhi_epi16( q2, q6 ); /* [ 05 15 25 35 45 55 65 75 ]*/
+ dst[6] = _mm_unpacklo_epi16( q3, q7 ); /* [ 06 16 26 36 46 56 66 76 ]*/
+ dst[7] = _mm_unpackhi_epi16( q3, q7 ); /* [ 07 17 27 37 47 57 67 77 ]*/
+ }
+ /* One DCT pass over an 8x8 block of shorts using SSE vectors.
+  * x is read and y is written as eight __m128i rows; every operation works
+  * lane-wise, so each of the eight lanes is transformed independently.
+  * NOTE(review): _mm_splat_epi16, _mm_mradds_epi16 and _mm_mr_epi16 are not
+  * defined here; presumably they come from SSE.h and mirror the AltiVec
+  * splat / vec_mradds operations -- confirm there.
+  * The temporaries t0-t10 are reassigned between stages, so the statement
+  * order below is significant. */
+ static inline void DCT_Transform ( short *x, short *y) {
+ __m128i *vx = (__m128i*) x;
+ __m128i *vy = (__m128i*) y;
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ /* Multiplier constants, one copy per lane. */
+ __m128i c13573 = _mm_splat_epi16(13573);
+ __m128i c21895 = _mm_splat_epi16(21895);
+ __m128i cNeg21895 = _mm_splat_epi16(-21895);
+ __m128i c23170 = _mm_splat_epi16(23170);
+ __m128i cNeg23170 = _mm_splat_epi16(-23170);
+ __m128i c6518 = _mm_splat_epi16(6518);
+ /* Stage 1: sums and differences of the row pairs 0/7, 1/6, 2/5, 3/4. */
+ t8 = _mm_adds_epi16(vx[0], vx[7]);
+ t9 = _mm_subs_epi16(vx[0], vx[7]);
+ t0 = _mm_adds_epi16(vx[1], vx[6]);
+ t7 = _mm_subs_epi16(vx[1], vx[6]);
+ t1 = _mm_adds_epi16(vx[2], vx[5]);
+ t6 = _mm_subs_epi16(vx[2], vx[5]);
+ t2 = _mm_adds_epi16(vx[3], vx[4]);
+ t5 = _mm_subs_epi16(vx[3], vx[4]);
+ /* Stage 2: combine the stage-1 results (t2 and t8 are overwritten here). */
+ t3 = _mm_adds_epi16(t8, t2);
+ t4 = _mm_subs_epi16(t8, t2);
+ t2 = _mm_adds_epi16(t0, t1);
+ t8 = _mm_subs_epi16(t0, t1);
+ t1 = _mm_adds_epi16(t7, t6);
+ t0 = _mm_subs_epi16(t7, t6);
+ /* Stage 3: output rows 0, 4, 2, 6 and the inputs to stage 4. */
+ vy[0] = _mm_adds_epi16(t3, t2);
+ vy[4] = _mm_subs_epi16(t3, t2);
+ vy[2] = _mm_mradds_epi16(t8, c13573, t4);
+ t10 = _mm_mr_epi16(t4, c13573);
+ vy[6] = _mm_subs_epi16(t10, t8);
+ t6 = _mm_mradds_epi16(t0, c23170, t5);
+ t7 = _mm_mradds_epi16(t0, cNeg23170, t5);
+ t2 = _mm_mradds_epi16(t1, cNeg23170, t9);
+ t3 = _mm_mradds_epi16(t1, c23170, t9);
+ /* Stage 4: output rows 1, 7, 5, 3. */
+ vy[1] = _mm_mradds_epi16(t6, c6518, t3);
+ t9 = _mm_mr_epi16(t3, c6518);
+ vy[7] = _mm_subs_epi16(t9, t6);
+ vy[5] = _mm_mradds_epi16(t2, c21895, t7);
+ vy[3] = _mm_mradds_epi16(t7, cNeg21895, t2);
+ }
+ /* STORE(i): post-scales row i of yv by PostScalev[i] and writes it to
+  * outputv[i]; all three names must exist in the expanding scope. The
+  * expansion already ends in a semicolon. */
+ #define STORE(i) \
+ outputv[i] = _mm_mradds_epi16(PostScalev[i], yv[i], _mm_splat_epi16(0));
+ /* 8x8 DCT of one block: copies input into the scratch block x, runs a DCT
+  * pass, transposes, runs a second pass, then post-scales the result from
+  * the scratch block y into output. */
+ void dct_vector(short *input, short *output, short *x, short *y) {
+ __m128i *inputv = (__m128i*) input;
+ __m128i *outputv = (__m128i*) output;
+ __m128i *xv = (__m128i*) x;
+ __m128i *yv = (__m128i*) y;
+ __m128i *PostScalev = (__m128i*) PostScalePtr;
+ unsigned row;
+ /* Copy the eight input rows into the scratch block. */
+ for (row = 0; row < 8; ++row)
+ xv[row] = inputv[row];
+ DCT_Transform( x, y );
+ Matrix_Transpose( y, x );
+ DCT_Transform( x, y );
+ /* Post-scale and store each row. */
+ for (row = 0; row < 8; ++row) {
+ STORE(row);
+ }
+ }
Index: llvm/examples/SIMD/DCT/dct.vectorc.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.vectorc.c:
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/dct.vectorc.c Sun Oct 23 17:49:39 2005
*** 0 ****
--- 1,140 ----
+ #include "Scalar.h"
+ #include "VectorC.h"
+ #include "Intrinsics.h"
+ // See the rgb2yuv benchmark for a description of USE_C0. For some
+ // reason, USE_C0 1 seems to be slightly *faster* on SSE! I'm
+ // investigating why this is.
+ //
+ #define USE_C0 1
+ short vllvm_adds_short(short,short);
+ /* MERGE declares three new variables in the enclosing scope: out01, which
+  * starts from vllvm_fixed_vimm_short(0, 16) and then has in0 and in1
+  * combined into it, and out0 / out1, which are extracted from out01 with
+  * the arguments (0, 2, 8) and (1, 2, 8) respectively.
+  * NOTE(review): presumably in0/in1 fill the two 8-element halves of out01
+  * and out0/out1 are its even/odd elements -- confirm against VectorC.h and
+  * Intrinsics.h. The expansion has no trailing semicolon; the caller adds it. */
+ #define MERGE(out01, out0, out1, in0, in1) \
+ short out01 = vllvm_fixed_vimm_short(0, 16); \
+ out01 = _fixed_combine_short(out01, 16, in0, 8, 0, 1); \
+ out01 = _fixed_combine_short(out01, 16, in1, 8, 8, 1); \
+ short out0 = _extract_short(out01, 0, 2, 8); \
+ short out1 = _extract_short(out01, 1, 2, 8)
+ /* IN(x): loads item x from input_scalar, which must be a variable in the
+  * expanding scope. */
+ #define IN(x) \
+ vllvm_load_short(input_scalar, 8, x)
+ /* STORE(out, idx): stores out at index idx of output_scalar, which must be
+  * a variable in the expanding scope. */
+ #define STORE(out, idx) \
+ vllvm_store_short(out, output_scalar, idx)
+ /* Vector C transpose of an 8x8 block of shorts, built from three rounds of
+  * MERGE followed by eight stores. Each MERGE invocation declares the
+  * variables named by its first three arguments, so every name below is
+  * introduced exactly once. The pairing of operands (k with k+4) is the same
+  * in all three rounds. */
+ static inline void Matrix_Transpose_VectorC (short *input_scalar, short *output_scalar) {
+ /* Round 1: merge input rows k and k+4. */
+ MERGE(b01, b0, b1, IN(0), IN(4));
+ MERGE(b23, b2, b3, IN(1), IN(5));
+ MERGE(b45, b4, b5, IN(2), IN(6));
+ MERGE(b67, b6, b7, IN(3), IN(7));
+ /* Round 2: merge round-1 results k and k+4. */
+ MERGE(a01, a0, a1, b0, b4);
+ MERGE(a23, a2, a3, b1, b5);
+ MERGE(a45, a4, a5, b2, b6);
+ MERGE(a67, a6, a7, b3, b7);
+ /* Round 3: merge round-2 results k and k+4. */
+ MERGE(out01, out0, out1, a0, a4);
+ MERGE(out23, out2, out3, a1, a5);
+ MERGE(out45, out4, out5, a2, a6);
+ MERGE(out67, out6, out7, a3, a7);
+ /* Write the eight results to output_scalar. */
+ STORE(out0, 0);
+ STORE(out1, 1);
+ STORE(out2, 2);
+ STORE(out3, 3);
+ STORE(out4, 4);
+ STORE(out5, 5);
+ STORE(out6, 6);
+ STORE(out7, 7);
+ }
+ /* One DCT pass written with the vllvm_* Vector C operations. Rows are read
+  * from x with vllvm_load_short(x, 8, row) and results are written to y with
+  * vllvm_store_short(value, y, row).
+  * NOTE(review): values declared as (signed) short here are passed to and
+  * returned from vllvm_* calls; presumably Vector LLVM treats them as
+  * 8-element vectors -- confirm against VectorC.h.
+  * The temporaries t0-t10 are reassigned between stages, so the statement
+  * order below is significant. */
+ static inline void DCT_Transform_VectorC ( short *x, short *y) {
+ signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ /* Stage 1: sums and differences of the row pairs 0/7, 1/6, 2/5, 3/4. */
+ t8 = vllvm_adds_short(vllvm_load_short(x, 8, 0), vllvm_load_short(x, 8, 7));
+ t9 = vllvm_subs_short(vllvm_load_short(x, 8, 0), vllvm_load_short(x, 8, 7));
+ t0 = vllvm_adds_short(vllvm_load_short(x, 8, 1), vllvm_load_short(x, 8, 6));
+ t7 = vllvm_subs_short(vllvm_load_short(x, 8, 1), vllvm_load_short(x, 8, 6));
+ t1 = vllvm_adds_short(vllvm_load_short(x, 8, 2), vllvm_load_short(x, 8, 5));
+ t6 = vllvm_subs_short(vllvm_load_short(x, 8, 2), vllvm_load_short(x, 8, 5));
+ t2 = vllvm_adds_short(vllvm_load_short(x, 8, 3), vllvm_load_short(x, 8, 4));
+ t5 = vllvm_subs_short(vllvm_load_short(x, 8, 3), vllvm_load_short(x, 8, 4));
+ /* Stage 2: combine the stage-1 results (t2 and t8 are overwritten here). */
+ t3 = vllvm_adds_short(t8, t2);
+ t4 = vllvm_subs_short(t8, t2);
+ t2 = vllvm_adds_short(t0, t1);
+ t8 = vllvm_subs_short(t0, t1);
+ t1 = vllvm_adds_short(t7, t6);
+ t0 = vllvm_subs_short(t7, t6);
+ /* Stage 3: output rows 0, 4, 2, 6 and the inputs to stage 4. */
+ vllvm_store_short(vllvm_adds_short(t3, t2), y, 0);
+ vllvm_store_short(vllvm_subs_short(t3, t2), y, 4);
+ short c13573 = vllvm_fixed_vimm_short(13573, 8);
+ /* With USE_C0 the zero addend is passed to vllvm_mradds_short; otherwise
+    vllvm_mr_short is called without an addend. */
+ #if USE_C0
+ short c0 = vllvm_fixed_vimm_short(0, 8);
+ #endif
+ short c23170 = vllvm_fixed_vimm_short(23170, 8);
+ short cneg23170 = vllvm_fixed_vimm_short(-23170, 8);
+ short c6518 = vllvm_fixed_vimm_short(6518, 8);
+ vllvm_store_short(vllvm_mradds_short(t8, c13573, t4), y, 2);
+ #if USE_C0
+ t10 = vllvm_mradds_short(t4, c13573, c0);
+ #else
+ t10 = vllvm_mr_short(t4, c13573);
+ #endif
+ vllvm_store_short(vllvm_subs_short(t10, t8), y, 6);
+ t6 = vllvm_mradds_short(t0, c23170, t5);
+ t7 = vllvm_mradds_short(t0, cneg23170, t5);
+ t2 = vllvm_mradds_short(t1, cneg23170, t9);
+ t3 = vllvm_mradds_short(t1, c23170, t9);
+ /* Stage 4: output rows 1, 7, 5, 3. */
+ vllvm_store_short(vllvm_mradds_short(t6, c6518, t3), y, 1);
+ #if USE_C0
+ t9 = vllvm_mradds_short(t3, c6518, c0);
+ #else
+ t9 = vllvm_mr_short(t3, c6518);
+ #endif
+ vllvm_store_short(vllvm_subs_short(t9, t6), y, 7);
+ vllvm_store_short(vllvm_mradds_short(t2, vllvm_fixed_vimm_short(21895, 8), t7), y, 5);
+ vllvm_store_short(vllvm_mradds_short(t7, vllvm_fixed_vimm_short(-21895, 8), t2), y, 3);
+ }
+ extern short *PostScalePtr;
+ /* STORE2(i): loads row i of PostScalePtr and of y, combines them with
+  * vllvm_mradds_short using a zero addend, and stores the result as row i of
+  * output. y and output must be variables in the expanding scope. The
+  * expansion already ends in a semicolon. */
+ #define STORE2(i) \
+ vllvm_store_short(vllvm_mradds_short(vllvm_load_short(PostScalePtr, 8, i), \
+ vllvm_load_short(y, 8, i), \
+ vllvm_fixed_vimm_short(0, 8)), \
+ output, i);
+ /* 8x8 DCT of one block in Vector C: copies input into the scratch block x,
+  * runs a DCT pass, transposes, runs a second pass, then post-scales the
+  * result from the scratch block y into output. */
+ void dct_vector(short *input, short *output, short *x, short *y) {
+ /* Copy the eight input rows into x. */
+ vllvm_store_short(vllvm_load_short(input, 8, 0), x, 0);
+ vllvm_store_short(vllvm_load_short(input, 8, 1), x, 1);
+ vllvm_store_short(vllvm_load_short(input, 8, 2), x, 2);
+ vllvm_store_short(vllvm_load_short(input, 8, 3), x, 3);
+ vllvm_store_short(vllvm_load_short(input, 8, 4), x, 4);
+ vllvm_store_short(vllvm_load_short(input, 8, 5), x, 5);
+ vllvm_store_short(vllvm_load_short(input, 8, 6), x, 6);
+ vllvm_store_short(vllvm_load_short(input, 8, 7), x, 7);
+ DCT_Transform_VectorC( x, y );
+ Matrix_Transpose_VectorC( y, x );
+ DCT_Transform_VectorC( x, y );
+ /* Post-scale and store each row. */
+ STORE2(0);
+ STORE2(1);
+ STORE2(2);
+ STORE2(3);
+ STORE2(4);
+ STORE2(5);
+ STORE2(6);
+ STORE2(7);
+ }
Index: llvm/examples/SIMD/DCT/main.c
diff -c /dev/null llvm/examples/SIMD/DCT/main.c:
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/main.c Sun Oct 23 17:49:39 2005
*** 0 ****
--- 1,175 ----
+ #define N 1024
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <sys/time.h>
+ #include <sys/times.h>
+ #include "../_malloc.h"
+ #include "Scalar.h"
+ inline void dct_scalar(short*, short*);
+ void dct_vector(short*, short*, short*, short*);
+ short *in;
+ short *out_vector;
+ short *out_scalar;
+ /* Post-scaling factors for the 8x8 DCT result, stored row-major (one line
+  * per row). dct_scalar reads this table directly; init() copies it into
+  * PostScalePtr, which is allocated with _malloc. */
+ static short PostScaleArray[64] = {
+ 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+ 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880,
+ 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+ 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+ 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+ 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+ 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+ 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880
+ };
+ short *PostScalePtr;
+ /* Allocates the benchmark buffers and fills the input with test data.
+  * Sets the globals in, out_vector and out_scalar (N shorts each) and
+  * PostScalePtr (a copy of PostScaleArray). Prints a message to stderr and
+  * exits with status 1 if any allocation returns NULL. */
+ void init() {
+ int i;
+ // Force 16-byte alignment
+ // NOTE(review): relies on _malloc from ../_malloc.h returning suitably
+ // aligned storage -- confirm there.
+ //
+ in = (short*) _malloc(N*sizeof(short));
+ out_vector = (short*) _malloc(N*sizeof(short));
+ out_scalar = (short*) _malloc(N*sizeof(short));
+ PostScalePtr = (short*) _malloc(sizeof PostScaleArray);
+ // Every buffer is dereferenced below or by the benchmark, so stop now
+ // rather than crash later on a null pointer.
+ //
+ if (in == NULL || out_vector == NULL || out_scalar == NULL || PostScalePtr == NULL) {
+ fprintf(stderr, "init: buffer allocation failed\n");
+ exit(1);
+ }
+ memcpy(PostScalePtr, PostScaleArray, sizeof PostScaleArray);
+ // Populate in with a range of values (N/2 down to -N/2+1)
+ //
+ for (i = 0; i < N; ++i) {
+ in[i] = (short) (N/2-i);
+ }
+ }
+ /* Runs one benchmark round: times 100000 passes of dct_scalar and of
+  * dct_vector over the N-element input (in 64-element blocks), prints both
+  * user times and their ratio, then checks that the two outputs agree
+  * element for element. On a mismatch it prints FAILED and exits with
+  * status 1. Returns the scalar/vector time ratio. */
+ float run() {
+ int i,j;
+ struct tms buf_s, buf_e;
+ long scalar_time = 0, vector_time = 0;
+ /* Scratch blocks for dct_vector. They are allocated on the first call and
+    reused afterwards, so repeated rounds do not leak a new pair each time. */
+ static short *x = NULL;
+ static short *y = NULL;
+ if (x == NULL)
+ x = (short*) _malloc(64*sizeof(short));
+ if (y == NULL)
+ y = (short*) _malloc(64*sizeof(short));
+ if (x == NULL || y == NULL) {
+ fprintf(stderr, "run: scratch allocation failed\n");
+ exit(1);
+ }
+ times(&buf_s);
+ for (i = 0; i < 100000; ++i)
+ for (j = 0; j < N; j +=64)
+ dct_scalar(in+j, out_scalar+j);
+ times(&buf_e);
+ scalar_time = buf_e.tms_utime - buf_s.tms_utime;
+ /* The times are long, so they must be printed with %ld, not %d. */
+ printf("scalar time=%ld, ", scalar_time);
+ times(&buf_s);
+ for (i = 0; i < 100000; ++i)
+ for (j = 0; j < N; j +=64)
+ dct_vector(in+j, out_vector+j, x, y);
+ times(&buf_e);
+ vector_time = buf_e.tms_utime - buf_s.tms_utime;
+ printf("vector time=%ld, ", vector_time);
+ float speedup = ((float) scalar_time)/vector_time;
+ printf("speedup=%f\n", speedup);
+ /* The vector result must match the scalar reference exactly. */
+ for (i = 0; i < N; i++) {
+ if (out_vector[i] != out_scalar[i]) {
+ printf("FAILED\n");
+ exit(1);
+ }
+ }
+ return speedup;
+ }
+ /* Sets up the buffers, runs NRUNS benchmark rounds and reports the best
+  * speedup seen. A failed comparison exits inside run(), so reaching the end
+  * means every round passed. */
+ int main (void) {
+ float best = 0;
+ unsigned round;
+ init();
+ for (round = 0; round < NRUNS; ++round) {
+ const float current = run();
+ best = (current > best) ? current : best;
+ }
+ printf("best speedup=%f\n", best);
+ printf ("PASSED\n");
+ return 0;
+ }
+ /* Scalar reference transpose of a row-major 8x8 matrix of shorts:
+  * output[8*col + row] receives input[8*row + col]. */
+ static inline void Matrix_Transpose ( short *input, short *output) {
+ unsigned row, col;
+ for (row = 0; row < 8; ++row) {
+ /* Walk input row `row` and scatter it down output column `row`. */
+ for (col = 0; col < 8; ++col)
+ output[8*col + row] = input[8*row + col];
+ }
+ }
+ /* Scalar reference for one DCT pass over a row-major 8x8 block of shorts.
+  * Iteration i reads x[8*r + i] for r = 0..7 (column i of x) and writes
+  * y[8*r + i] (column i of y); the steps follow the same sequence as the
+  * vector versions, one element at a time.
+  * NOTE(review): adds_short, subs_short and mradds_short are not defined
+  * here; presumably they come from Scalar.h and are scalar counterparts of
+  * the vector add / subtract / multiply-add operations -- confirm there.
+  * The temporaries t0-t10 are reassigned between stages, so the statement
+  * order below is significant. */
+ static inline void DCT_Transform ( short *x, short *y) {
+ signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ unsigned i;
+ for (i = 0; i < 8; ++i) {
+ /* Stage 1: sums and differences of the row pairs 0/7, 1/6, 2/5, 3/4. */
+ t8 = adds_short(x[i], x[56+i]);
+ t9 = subs_short(x[i], x[56+i]);
+ t0 = adds_short(x[8+i], x[48+i]);
+ t7 = subs_short(x[8+i], x[48+i]);
+ t1 = adds_short(x[16+i], x[40+i]);
+ t6 = subs_short(x[16+i], x[40+i]);
+ t2 = adds_short(x[24+i], x[32+i]);
+ t5 = subs_short(x[24+i], x[32+i]);
+ /* Stage 2: combine the stage-1 results (t2 and t8 are overwritten here). */
+ t3 = adds_short(t8, t2);
+ t4 = subs_short(t8, t2);
+ t2 = adds_short(t0, t1);
+ t8 = subs_short(t0, t1);
+ t1 = adds_short(t7, t6);
+ t0 = subs_short(t7, t6);
+ /* Stage 3: output rows 0, 4, 2, 6 and the inputs to stage 4. */
+ y[i] = adds_short(t3, t2);
+ y[32+i] = subs_short(t3, t2);
+ y[16+i] = mradds_short(t8, 13573, t4);
+ t10 = mradds_short(t4, 13573, 0);
+ y[48+i] = subs_short(t10, t8);
+ t6 = mradds_short(t0, 23170, t5);
+ t7 = mradds_short(t0, -23170, t5);
+ t2 = mradds_short(t1, -23170, t9);
+ t3 = mradds_short(t1, 23170, t9);
+ /* Stage 4: output rows 1, 7, 5, 3. */
+ y[8+i] = mradds_short(t6, 6518, t3);
+ t9 = mradds_short(t3, 6518, 0);
+ y[56+i] = subs_short(t9, t6);
+ y[40+i] = mradds_short(t2, 21895, t7);
+ y[24+i] = mradds_short(t7, -21895, t2);
+ }
+ }
+ /* Scalar reference 8x8 DCT of one block: copies the 64 input shorts, runs
+  * a DCT pass, transposes, runs a second pass, then post-scales every
+  * element with the matching PostScaleArray entry into output. */
+ void dct_scalar(short *input, short *output) {
+ short work[64], result[64];
+ unsigned k;
+ memcpy(work, input, sizeof work);
+ DCT_Transform( work, result );
+ Matrix_Transpose( result, work );
+ DCT_Transform( work, result );
+ /* The block is row-major, so one flat pass visits the elements in the
+    same order as a row/column double loop. */
+ for (k = 0; k < 64; ++k)
+ output[k] = mradds_short(PostScaleArray[k], result[k], 0);
+ }
