[llvm-commits] [vector_llvm] CVS: llvm/examples/SIMD/InterQuant/Makefile interquant.altivec.handwritten.c interquant.sse.handwritten.c interquant.vectorc.c main.c

Sun Oct 23 15:50:20 PDT 2005

Changes in directory llvm/examples/SIMD/InterQuant:

Makefile added (r1.1.2.1)
interquant.altivec.handwritten.c added (r1.1.2.1)
interquant.sse.handwritten.c added (r1.1.2.1)
interquant.vectorc.c added (r1.1.2.1)
main.c added (r1.1.2.1)
---
Log message:

Examples to illustrate Vector LLVM's SIMD support.

---
Diffs of the changes:  (+201 -0)

 Makefile                         |    4 +
 interquant.altivec.handwritten.c |    1 
 interquant.sse.handwritten.c     |   40 +++++++++++++
 interquant.vectorc.c             |   44 +++++++++++++++
 main.c                           |  112 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 201 insertions

Index: llvm/examples/SIMD/InterQuant/Makefile
diff -c /dev/null llvm/examples/SIMD/InterQuant/Makefile:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:00 2005
--- llvm/examples/SIMD/InterQuant/Makefile	Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1,4 ----
+ NAME= interquant
+ 
+ include ../Makefile.common
+ 

Index: llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c
diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:17 2005
--- llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c	Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1 ----
+ /***************************************************************
 *
 * Copyright:   (c) Copyright Motorola Inc. 1998
 *
 * Date:        May 18, 1998
 *
 * Function:    INTER_Quantization
 *
 * Description: The INTER_QUANTIZATION routine will quantize
 *              the predictive frames (P-picture). Coefficients
 *              are quantized to the formula:
 *                 C' = sign(C) * ( abs(C) - QP/2 ) / ( 2 * QP ).
 *              To ensure ( abs(C) - QP/2 ) is positive, saturating
 *              unsigned subtraction is used.
 *
 * Inputs:      input - Pointer to input data (short), which
 *                      must be between -2040 and 2040 (as set
 *                      up by DCT ). It is assumed that the allocated
 *                      array has been 128-bit aligned and contains
 *                      8x8 short elements.
 *
 * Outputs:     output - Pointer to output area for the transformed
 *                       data. The output values are between -127
 *                       and 127. It is assumed that a 128-bit
 *                       aligned 8x8 array of signed char has been
 *                       pre-allocated.
 *
 * QP:          QP (quantization parameter?) ranges from 1 to 31
 *
 **************************************************************/

/*
 * INTER_CALC( input, output ) - quantize one vector of eight coefficients.
 *
 * Must be expanded where these names are in scope (as in interquant_vector):
 *   zero, maxq, dtqp.v  - vector signed short constants
 *   qpd2                - vector unsigned short constant
 *   t1..t5, u1, msk     - scratch vectors, overwritten by every expansion
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement, and both arguments are parenthesised.
 */
#define INTER_CALC( input, output ) \
     do { \
          t1 = vec_subs( zero, (input) );                       /* -C, saturating                  */ \
          u1 = (vector unsigned short ) vec_max( (input), t1 ); /* abs(C)                          */ \
          t2 = (vector signed short ) vec_subs( u1, qpd2 );     /* max(0,(abs(C)-QP/2))            */ \
          t3 = vec_madds( t2, dtqp.v, zero );                   /* ((abs(C)-QP/2) * dtqp) >> 15    */ \
          t4 = vec_min( maxq, t3 );                             /* peg value at 127 if greater     */ \
          msk = vec_cmpgt( zero, (input) );                     /* set where input is negative     */ \
          t5 = vec_subs( zero, t4 );                            /* negated result                  */ \
          (output) = vec_sel( t4, t5, msk );                    /* ensure result is same sign      */ \
     } while (0)

/*
 * interquant_vector (AltiVec) - quantize one 8x8 block of coefficients.
 *
 *   in  - 64 signed shorts, read as eight vector signed short values
 *   out - 64 signed chars, written as four vector signed char values
 *   QP  - quantization parameter; it is used as a divisor, so it must
 *         not be zero (the header comment gives the range 1..31)
 *
 * Both pointers are cast directly to vector pointers, so they must meet
 * the 128-bit alignment stated in the header comment.
 */
void interquant_vector ( signed short* in,
			 signed char* out,
			 int QP )
{
     vector signed short* input = (vector signed short*) in;
     vector signed char* output = (vector signed char*) out;

     /* ensure alignment so calculated constant can be
        propagated into entire vector for calculations */
     union{
        vector signed short v;
        signed short s[8];
     } dtqp;

     vector signed short zero, maxq, parta, partb;
     vector signed short t1, t2, t3, t4, t5; /* used in macros */
     vector unsigned short qpd2, u1;
     vector bool short msk;

     /* load the calculated constants into the vector:
        s[0] = 32768/(2*QP) rounded to nearest, s[1] = QP/2 */
     dtqp.s[0] = (signed short)((int)((32768+QP)/(2*QP)));
     dtqp.s[1] = (signed short)(QP/2);
     /* splat QP/2 first: the next statement overwrites dtqp.v */
     qpd2 = (vector unsigned short) vec_splat( dtqp.v, 1);
     dtqp.v = vec_splat( dtqp.v, 0 );

     /* load the static constants used in the macros */
     zero = (vector signed short) (0);
     maxq = (vector signed short) (127);

     /* for all input compute: C' = sign(C) * ( (abs(C)-(QP/2) ) / 2*QP ) */
     INTER_CALC( input[0], parta );
     INTER_CALC( input[1], partb );
     output[0] = vec_pack( parta, partb );

     INTER_CALC( input[2], parta );
     INTER_CALC( input[3], partb );
     output[1] = vec_pack( parta, partb );

     INTER_CALC( input[4], parta );
     INTER_CALC( input[5], partb );
     output[2] = vec_pack( parta, partb );

     INTER_CALC( input[6], parta );
     INTER_CALC( input[7], partb );
     output[3] = vec_pack( parta, partb );

}

\ No newline at end of file

Index: llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c
diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c	Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1,40 ----
+ #include "SSE.h"
+ 
+ /*
+  * SSE version of the quantizer for one block of 64 coefficients.
+  *
+  * Reads eight __m128i values from `in` and writes four to `out`; each
+  * output vector is packed from two consecutive quantized input vectors.
+  * qp is used as a divisor and must not be zero.
+  */
+ void interquant_vector ( signed short* in,
+ 			 signed char* out,
+ 			 int qp) {
+   __m128i *src = (__m128i*) in;
+   __m128i *dst = (__m128i*) out;
+   short recip = (32768+qp)/(2*qp);   /* 32768/(2*qp), rounded */
+   __m128i recip_v = _mm_splat_epi16(recip);
+   __m128i zero_v = _mm_splat_epi16(0);
+   __m128i half_qp_v = _mm_splat_epi16(qp/2);
+   __m128i limit_v = _mm_splat_epi16(127);
+   __m128i halves[2];
+   int row, half;
+ 
+   for (row = 0; row < 4; ++row) {
+     for (half = 0; half < 2; ++half) {
+       __m128i coeff = src[2*row + half];
+ 
+       /* magnitude = max(C, -C), then subtract qp/2 without going below 0 */
+       __m128i negated = _mm_subs_epi16(zero_v, coeff);
+       __m128i magnitude = _mm_max_epi16(coeff, negated);
+       __m128i reduced = _mm_subs_epu16(magnitude, half_qp_v);
+ 
+       /* (reduced * recip) >> 15, rebuilt from the two 16-bit product halves */
+       __m128i prod_hi = _mm_mulhi_epi16(reduced, recip_v);
+       __m128i prod_lo = _mm_mullo_epi16(reduced, recip_v);
+       __m128i quant = _mm_or_si128(_mm_slli_epi16(prod_hi, 1),
+                                    _mm_srli_epi16(prod_lo, 15));
+ 
+       /* cap at 127, then take the negated value where the input was < 0 */
+       __m128i capped = _mm_min_epi16(limit_v, quant);
+       __m128i is_neg = _mm_cmpgt_epi16(zero_v, coeff);
+       __m128i flipped = _mm_subs_epi16(zero_v, capped);
+       halves[half] = _mm_select_si128(is_neg, flipped, capped);
+     }
+     dst[row] = _mm_pack_epi16(halves[0], halves[1]);
+   }
+ }
+ 

Index: llvm/examples/SIMD/InterQuant/interquant.vectorc.c
diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.vectorc.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/InterQuant/interquant.vectorc.c	Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1,44 ----
+ #include "VectorC.h"
+ #include "Intrinsics.h"
+ 
+ void interquant_vector(signed short* in, signed char* out, int qp ) {
+   int i, j;   /* j is not used below */
+   /*
+    * Vector-C version of the quantizer. Each loop iteration processes the
+    * two input vectors at indices 2*i and 2*i+1 with the same steps
+    * (negate, max with the input, subtract qpd2, multiply by v and shift
+    * right by 15, min with maxq, select the negated value where zero >
+    * input) and stores one combined result at index i.
+    * NOTE(review): the vllvm_* intrinsics are declared outside this file;
+    * the variables below are typed as scalars but presumably stand for
+    * whole vectors -- confirm against VectorC.h / Intrinsics.h.
+    */
+   short part1, part2;
+   short t1, t2, t3, t4, t5;
+   unsigned short u1;
+   short msk;
+   /* Constants; the trailing 8 is presumably the vector length -- confirm. */
+   unsigned short qpd2 = vllvm_fixed_vimm_short((short) qp/2, 8);   /* the cast binds to qp, then /2 */
+   short v = vllvm_fixed_vimm_short((short)((int)((32768+qp)/(2*qp))), 8);   /* 32768/(2*qp), rounded; qp must be non-zero */
+   
+   short zero = vllvm_fixed_vimm_short(0, 8);
+   short maxq = vllvm_fixed_vimm_short(127, 8);
+   
+   for (i = 0; i < 4; ++i) {
+     short in_vec = vllvm_load_short(in, 8, 2*i);
+     t1 = vllvm_subs_short( zero, in_vec);
+     u1 = (unsigned short) vllvm_max_short( in_vec, t1 );
+     t2 = vllvm_subs_ushort( u1, qpd2 );
+     t3 = t2*v >> 15;
+     t4 = vllvm_min_short(maxq,t3);
+     msk = zero > in_vec;
+     t5 = vllvm_subs_short( zero, t4 );
+     part1 = vllvm_vselect_short(msk, t5, t4);
+     /* Second half: the same steps for the input at index 2*i+1. */
+     in_vec = vllvm_load_short(in, 8, 2*i+1);
+     t1 = vllvm_subs_short( zero, in_vec);
+     u1 = (unsigned short) vllvm_max_short( in_vec, t1 );
+     t2 = (short) vllvm_subs_ushort( u1, qpd2 );
+     t3 = (t2*v) >> 15;
+     t4 = vllvm_min_short(maxq,t3);
+     msk = zero > in_vec;
+     t5 = vllvm_subs_short( zero, t4 );
+     part2 = vllvm_vselect_short(msk, t5, t4);
+     /* Place part1 at offset 0 and part2 at offset 8 of out_vec, then store at index i. */
+     short out_vec = vllvm_fixed_vimm_short(0, 16);
+     out_vec = vllvm_fixed_combine_short(out_vec, 16, part1, 8, 0, 1);
+     out_vec = vllvm_fixed_combine_short(out_vec, 16, part2, 8, 8, 1);
+     vllvm_store_char(out_vec, out, i);
+   }
+ }

Index: llvm/examples/SIMD/InterQuant/main.c
diff -c /dev/null llvm/examples/SIMD/InterQuant/main.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/InterQuant/main.c	Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1,112 ----
+ #define N 1024 //2048*2
+ #define MAX_QP 31
+ 
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <sys/time.h>
+ #include <sys/times.h>
+ #include "../_malloc.h"
+ 
+ void interquant_scalar(short*,signed char*,int);   /* reference version, defined at the end of this file */
+ void interquant_vector(short*,signed char*,int);   /* not defined in this file */
+ 
+ short *in;      /* N input coefficients, filled by init() */
+ char *vector;   /* results written by interquant_vector in run() */
+ char *scalar;   /* results written by interquant_scalar in run() */
+ 
+ /*
+  * Allocate the three work buffers and fill the input buffer with the
+  * ramp -(N/2) .. N/2-1.
+  *
+  * init() is called from main() and again from every run(), so the
+  * buffers are allocated only on the first call instead of being
+  * reallocated (and leaked) each time. The input ramp is rewritten on
+  * every call. Exits with status 1 if an allocation returns NULL.
+  */
+ void init() {
+   int i;
+ 
+   // Force 16-byte alignment (via _malloc from ../_malloc.h)
+   //
+   if (in == NULL)
+     in = (short*) _malloc(N*sizeof(short));
+   if (vector == NULL)
+     vector = (char*) _malloc(N*sizeof(short));
+   if (scalar == NULL)
+     scalar = (char*) _malloc(N*sizeof(short));
+ 
+   if (in == NULL || vector == NULL || scalar == NULL) {
+     fprintf(stderr, "interquant: buffer allocation failed\n");
+     exit(1);
+   }
+ 
+   // Populate in with a range of values
+   //
+   for (i = 0; i < N; ++i) {
+     in[i] = -(N/2)+i;
+   }
+ 
+ }
+ 
+ /*
+  * Time 100000 passes of the scalar and of the vector quantizer over the
+  * N-element input (64 coefficients per call), check that both produced
+  * the same N output bytes, and print the times and their ratio.
+  *
+  *   scalar_time - receives the user time of the scalar loop
+  *   vector_time - receives the user time of the vector loop
+  *
+  * Times are differences of tms_utime as returned by times(). Prints
+  * "FAILED" and exits with status 1 if the outputs differ.
+  */
+ void run(long *scalar_time, long *vector_time) {
+   int i,j;
+   int qp = 10;
+   struct tms buf_s, buf_e;
+ 
+   init();
+ 
+   times(&buf_s);
+   for (j = 0; j < 100000; ++j) {
+     for (i = 0; i < N/64; ++i) {
+       interquant_scalar(in+64*i, (signed char*)(scalar+64*i), qp);
+     }
+   }
+   times(&buf_e);
+ 
+   // The times are long, so they must be printed with %ld, not %d.
+   *scalar_time = (long) (buf_e.tms_utime - buf_s.tms_utime);
+   printf("scalar time=%ld, ", *scalar_time);
+ 
+   times(&buf_s);
+   for (j = 0; j < 100000; ++j) {
+     for (i = 0; i < N/64; ++i) {
+       interquant_vector(in+64*i, (signed char*)(vector+64*i), qp);
+     }
+   }
+   times(&buf_e);
+ 
+   *vector_time = (long) (buf_e.tms_utime - buf_s.tms_utime);
+   printf("vector time=%ld, ", *vector_time);
+ 
+   for (i = 0; i < N; i++) {
+     if (vector[i] != scalar[i]) {
+       printf("FAILED\n");
+       exit(1);
+     }
+   }
+ 
+   float speedup = ((float) *scalar_time) / *vector_time;
+   printf("speedup=%f\n", speedup);
+ 
+ }
+ 
+ /*
+  * Run the benchmark NRUNS times and print the smallest scalar and
+  * vector times seen, plus their ratio.
+  *
+  * NOTE(review): NRUNS is not defined in this file; presumably it is
+  * supplied by the build (e.g. ../Makefile.common) -- confirm.
+  */
+ int
+ main (void) {
+   unsigned i;
+   long best_scalar = -1, best_vector = -1;   // -1 means "no measurement yet"
+   long scalar_ticks, vector_ticks;           // named so they do not shadow the global buffers
+ 
+   init();
+ 
+   for (i = 0; i < NRUNS; ++i) {
+     run (&scalar_ticks, &vector_ticks);
+     if (best_scalar < 0 || best_scalar > scalar_ticks)
+       best_scalar = scalar_ticks;
+     if (best_vector < 0 || best_vector > vector_ticks)
+       best_vector = vector_ticks;
+   }
+ 
+   // The best times are long, so they must be printed with %ld, not %d.
+   printf("best scalar=%ld, ", best_scalar);
+   printf("best vector=%ld, ", best_vector);
+   printf("speedup=%f\n", ((float) best_scalar)/best_vector);
+   printf ("PASSED\n");
+   return 0;
+ }
+ 
+ /*
+  * Scalar reference quantizer for one block of 64 coefficients.
+  *
+  * For each coefficient C: take abs(C) (with -32768 mapped to 32767),
+  * subtract qp/2 and clamp at 0, multiply by (32768+qp)/(2*qp) and divide
+  * by 32768, cap the result at 127, then restore the sign of C.
+  *
+  *   in  - 64 input coefficients
+  *   out - 64 quantized results
+  *   qp  - quantization parameter; used as a divisor, must not be zero
+  */
+ void interquant_scalar( signed short* in,  signed char* out, int qp) {
+   const int recip = (32768+qp)/(2*qp);
+   const int half_qp = qp/2;
+   int idx = 0;
+ 
+   while (idx < 64) {
+     short coeff = in[idx];
+     short negated, reduced, capped;
+     unsigned short magnitude;
+     int scaled;
+ 
+     /* saturating negate */
+     if (coeff == -32768)
+       negated = 32767;
+     else
+       negated = -coeff;
+ 
+     magnitude = (unsigned short) ((coeff > negated) ? coeff : negated);
+ 
+     reduced = (short) (magnitude - half_qp);
+     if (reduced <= 0)
+       reduced = 0;
+ 
+     scaled = (reduced * recip) / 32768;
+     if (scaled > 32767)
+       scaled = 32767;
+     if (scaled < -32768)
+       scaled = -32768;
+ 
+     capped = (scaled < 127) ? scaled : 127;
+     out[idx] = (coeff < 0) ? -capped : capped;
+     ++idx;
+   }
+ }
+