[llvm-commits] [vector_llvm] CVS: llvm/examples/SIMD/InterQuant/Makefile interquant.altivec.handwritten.c interquant.sse.handwritten.c interquant.vectorc.c main.c
Robert L. Bocchino Jr.
bocchino at persephone.cs.uiuc.edu
Sun Oct 23 15:50:20 PDT 2005
Changes in directory llvm/examples/SIMD/InterQuant:
Makefile added (r1.1.2.1)
interquant.altivec.handwritten.c added (r1.1.2.1)
interquant.sse.handwritten.c added (r1.1.2.1)
interquant.vectorc.c added (r1.1.2.1)
main.c added (r1.1.2.1)
---
Log message:
Examples to illustrate Vector LLVM's SIMD support.
---
Diffs of the changes: (+201 -0)
Makefile | 4 +
interquant.altivec.handwritten.c | 1
interquant.sse.handwritten.c | 40 +++++++++++++
interquant.vectorc.c | 44 +++++++++++++++
main.c | 112 +++++++++++++++++++++++++++++++++++++++
5 files changed, 201 insertions
Index: llvm/examples/SIMD/InterQuant/Makefile
diff -c /dev/null llvm/examples/SIMD/InterQuant/Makefile:1.1.2.1
*** /dev/null Sun Oct 23 17:50:00 2005
--- llvm/examples/SIMD/InterQuant/Makefile Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1,4 ----
+ NAME= interquant
+ # All build rules come from the shared ../Makefile.common included below. NOTE(review): NAME is presumably read by that file to pick the example's sources -- confirm there.
+ include ../Makefile.common
+
Index: llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c
diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c:1.1.2.1
*** /dev/null Sun Oct 23 17:50:17 2005
--- llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1 ----
+ /***************************************************************
*
* Copyright: (c) Copyright Motorola Inc. 1998
*
* Date: May 18, 1998
*
* Function: INTER_Quantization
*
* Description: The INTER_QUANTIZATION routine will quantize
* the predictive frames (P-picture). Coefficients
* are quantized to the formula:
* C' = sign(C) * ( abs(C) - QP/2 ) / ( 2 * QP ).
* To ensure ( abs(C) - QP/2 ) is positive, saturating
* unsigned subtraction is used.
*
* Inputs: input - Pointer to input data (short), which
* must be between -2040 and 2040 (as set
* up by DCT ). It is assumed that the allocated
* array has been 128-bit aligned and contains
* 8x8 short elements.
*
* Outputs: output - Pointer to output area for the transformed
* data. The output values are between -127
* and 127. It is assumed that a 128-bit
* aligned 8x8 array of signed char has been
* pre-allocated.
*
* QP: QP (quantization parameter?) ranges from 1 to 31
*
**************************************************************/
/*
 * INTER_CALC( input, output ) - quantize one vector of coefficients.
 *
 * Expands to a plain sequence of assignment statements (it is NOT wrapped in
 * do { } while (0)), so it must only be used where several statements are
 * legal, never as the unbraced body of an if/for/while.
 *
 * The expansion reads and writes names that must already exist in the
 * caller's scope: t1, t2, t3, t4, t5, u1, msk (scratch) and zero, qpd2,
 * dtqp, maxq (constants).  `input` is evaluated three times, so it must be
 * free of side effects.  The last statement stores the result in `output`.
 */
#define INTER_CALC( input, output ) \
t1 = vec_subs( zero, input);\
u1 = (vector unsigned short ) vec_max( input, t1 ); /* ( abs(C)) */ \
t2 = (vector signed short ) vec_subs( u1, qpd2 );/*max(0,(abs(C)-QP/2)) */ \
t3 = vec_madds( t2, dtqp.v, zero ); /* ( (abs(C)-QP/2)/(2*QP) )>>15 ) */ \
t4 = vec_min(maxq,t3); /* peg value at 127 if greater */ \
msk = vec_cmpgt( zero, input ); /* select to find sign of input */ \
t5 = vec_subs( zero, t4 );\
output = vec_sel( t4, t5, msk ); /* ensure result is same sign */
/*
 * interquant_vector - hand-written AltiVec inter-quantizer for one 8x8 block.
 *
 * in  - 64 signed shorts; reinterpreted as vector signed short and read as
 *       input[0] .. input[7].
 * out - 64 signed chars; reinterpreted as vector signed char and written as
 *       output[0] .. output[3].
 * QP  - quantization parameter.  It is used as a divisor below, so it must be
 *       non-zero (the file header gives the range 1..31).
 *
 * Both pointers are dereferenced directly as AltiVec vectors, which is why
 * the file header requires the arrays to be 128-bit aligned.
 */
void interquant_vector ( signed short* in,
                         signed char* out,
                         int QP )
{
    vector signed short* input = (vector signed short*) in;
    vector signed char* output = (vector signed char*) out;

    /* ensure alignment so calculated constant can be
       propagated into entire vector for calculations */
    union{
        vector signed short v;
        signed short s[8];
    } dtqp;

    vector signed short zero, maxq, parta, partb;
    vector signed short t1, t2, t3, t4, t5; /* used in macros */
    vector unsigned short qpd2, u1;
    vector bool short msk;

    /* load the calculated constants into the vector:
       element 0 is the fixed-point multiplier, element 1 is QP/2 */
    dtqp.s[0] = (signed short)((int)((32768+QP)/(2*QP)));
    dtqp.s[1] = (signed short)(QP/2);

    /* qpd2 must be splatted from element 1 BEFORE dtqp.v is overwritten
       with the splat of element 0 */
    qpd2 = (vector unsigned short) vec_splat( dtqp.v, 1);
    dtqp.v = vec_splat( dtqp.v, 0 );

    /* load the static constants used in the macros */
    zero = (vector signed short) (0);
    maxq = (vector signed short) (127);

    /* for all input compute: C' = sign(C) * ( (abs(C)-(QP/2) ) / 2*QP ) */
    /* each pair of quantized short vectors is packed into one char vector */
    INTER_CALC( input[0], parta );
    INTER_CALC( input[1], partb );
    output[0] = vec_pack( parta, partb );

    INTER_CALC( input[2], parta );
    INTER_CALC( input[3], partb );
    output[1] = vec_pack( parta, partb );

    INTER_CALC( input[4], parta );
    INTER_CALC( input[5], partb );
    output[2] = vec_pack( parta, partb );

    INTER_CALC( input[6], parta );
    INTER_CALC( input[7], partb );
    output[3] = vec_pack( parta, partb );
}
\ No newline at end of file
Index: llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c
diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c:1.1.2.1
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1,40 ----
+ #include "SSE.h"
+
+ /*
+  * interquant_vector - SSE inter-quantizer for one block of 64 coefficients.
+  *
+  * in  - 64 signed shorts, read as eight __m128i values.
+  * out - 64 signed chars, written as four __m128i values.
+  * qp  - quantization parameter; used as a divisor, so it must be non-zero.
+  *
+  * Both pointers are dereferenced as __m128i, so callers are expected to
+  * pass suitably aligned buffers.
+  * NOTE(review): _mm_splat_epi16, _mm_select_si128 and _mm_pack_epi16 are not
+  * visible here; presumably they are helpers declared in "SSE.h" -- confirm
+  * their argument order and saturation behaviour there.
+  */
+ void interquant_vector ( signed short* in,
+                          signed char* out,
+                          int qp) {
+   int i, j;
+   short dtqp = (32768+qp)/(2*qp);   /* fixed-point multiplier for the divide by 2*qp */
+   __m128i dtqp_vec = _mm_splat_epi16(dtqp);
+   __m128i zero = _mm_splat_epi16(0);
+   __m128i qpd2 = _mm_splat_epi16(qp/2);
+   __m128i maxq = _mm_splat_epi16(127);
+   __m128i *in_vp = (__m128i*) in;
+   __m128i *out_vp = (__m128i*) out;
+   __m128i result[2];
+
+   /* Each outer iteration consumes two input vectors (2 x 8 shorts) and
+      produces one output vector (16 chars). */
+   for (i = 0; i < 4; ++i) {
+     for (j = 0; j < 2; ++j) {
+       __m128i input = *in_vp++;
+
+       /* u1 = max(input, 0 - input), i.e. the magnitude of each lane */
+       __m128i t1 = _mm_subs_epi16(zero, input);
+       __m128i u1 = _mm_max_epi16(input, t1);
+
+       /* saturating unsigned subtract keeps (magnitude - qp/2) at >= 0 */
+       __m128i t2 = _mm_subs_epu16(u1, qpd2);
+
+       // tmp = t2 * dtqp_vec, kept as separate high and low 16-bit halves
+       __m128i tmp_hi = _mm_mulhi_epi16(t2, dtqp_vec);
+       __m128i tmp_lo = _mm_mullo_epi16(t2, dtqp_vec);
+
+       // short t3 = tmp >> 15, rebuilt from the two halves
+       __m128i hi = _mm_slli_epi16(tmp_hi, 1);
+       __m128i lo = _mm_srli_epi16(tmp_lo, 15);
+       __m128i t3 = _mm_or_si128(hi, lo);
+
+       /* cap at 127, then pick the negated value for negative inputs */
+       __m128i t4 = _mm_min_epi16(maxq, t3);
+       __m128i mask = _mm_cmpgt_epi16(zero, input);
+       __m128i neg = _mm_subs_epi16(zero, t4);
+       result[j] = _mm_select_si128(mask, neg, t4);
+     }
+     *out_vp++ = _mm_pack_epi16(result[0], result[1]);
+   }
+ }
+
Index: llvm/examples/SIMD/InterQuant/interquant.vectorc.c
diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.vectorc.c:1.1.2.1
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/InterQuant/interquant.vectorc.c Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1,44 ----
+ #include "VectorC.h"
+ #include "Intrinsics.h"
+
+ void interquant_vector(signed short* in, signed char* out, int qp ) { // Vector C version: quantizes 64 shorts from `in` into `out`, two 8-element loads per loop pass
+ int i, j; // NOTE(review): j is never used in this function
+ // NOTE(review): the scalar-typed locals below are assigned from vllvm_* intrinsics; presumably Vector LLVM treats them as 8-lane vectors -- confirm against VectorC.h
+ short part1, part2;
+ short t1, t2, t3, t4, t5;
+ unsigned short u1;
+ short msk;
+ // Constants built once before the loop; qp is used as a divisor, so it must be non-zero.
+ unsigned short qpd2 = vllvm_fixed_vimm_short((short) qp/2, 8); // the cast binds to qp alone, i.e. ((short) qp) / 2
+ short v = vllvm_fixed_vimm_short((short)((int)((32768+qp)/(2*qp))), 8); // fixed-point multiplier paired with the >> 15 below
+ // zero and maxq (127) are the comparison and clamp operands used in the loop.
+ short zero = vllvm_fixed_vimm_short(0, 8);
+ short maxq = vllvm_fixed_vimm_short(127, 8);
+ // Each pass handles loads 2*i and 2*i+1 and performs one store at index i.
+ for (i = 0; i < 4; ++i) {
+ short in_vec = vllvm_load_short(in, 8, 2*i);
+ t1 = vllvm_subs_short( zero, in_vec);
+ u1 = (unsigned short) vllvm_max_short( in_vec, t1 ); // max(x, 0 - x): magnitude
+ t2 = vllvm_subs_ushort( u1, qpd2 ); // NOTE(review): no (short) cast here, unlike the second half below
+ t3 = t2*v >> 15; // parses as (t2*v) >> 15, same as the second half
+ t4 = vllvm_min_short(maxq,t3);
+ msk = zero > in_vec; // marks negative inputs
+ t5 = vllvm_subs_short( zero, t4 );
+ part1 = vllvm_vselect_short(msk, t5, t4);
+ // Second half: same sequence on load 2*i+1, result kept in part2.
+ in_vec = vllvm_load_short(in, 8, 2*i+1);
+ t1 = vllvm_subs_short( zero, in_vec);
+ u1 = (unsigned short) vllvm_max_short( in_vec, t1 );
+ t2 = (short) vllvm_subs_ushort( u1, qpd2 );
+ t3 = (t2*v) >> 15;
+ t4 = vllvm_min_short(maxq,t3);
+ msk = zero > in_vec;
+ t5 = vllvm_subs_short( zero, t4 );
+ part2 = vllvm_vselect_short(msk, t5, t4);
+ // NOTE(review): presumably the two combines place part1 at offset 0 and part2 at offset 8 of a 16-lane value -- confirm argument meaning in Intrinsics.h
+ short out_vec = vllvm_fixed_vimm_short(0, 16);
+ out_vec = vllvm_fixed_combine_short(out_vec, 16, part1, 8, 0, 1);
+ out_vec = vllvm_fixed_combine_short(out_vec, 16, part2, 8, 8, 1);
+ vllvm_store_char(out_vec, out, i);
+ }
+ }
Index: llvm/examples/SIMD/InterQuant/main.c
diff -c /dev/null llvm/examples/SIMD/InterQuant/main.c:1.1.2.1
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/InterQuant/main.c Sun Oct 23 17:49:40 2005
***************
*** 0 ****
--- 1,112 ----
+ #define N 1024 //2048*2
+ #define MAX_QP 31
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <sys/time.h>
+ #include <sys/times.h>
+ #include "../_malloc.h"
+
+ void interquant_scalar(short*,signed char*,int); // scalar reference implementation, defined later in this file
+ void interquant_vector(short*,signed char*,int); // not defined in this file; NOTE(review): presumably supplied by one of the sibling interquant.*.c files at link time -- confirm
+ // Work buffers shared by init() and run(); init() assigns all three.
+ short *in; // input coefficients, N shorts filled by init()
+ char *vector; // bytes written by interquant_vector in run()
+ char *scalar; // bytes written by interquant_scalar in run()
+
+ /*
+  * init - allocate the three global work buffers and fill `in`.
+  *
+  * This function is called once from main() and again at the start of every
+  * run(), so the buffers are allocated only on the first call (the globals
+  * start out as null pointers) and reused afterwards instead of leaking a
+  * fresh set on each call.  `in` is refilled on every call.
+  *
+  * Exits the program if any buffer could not be allocated.
+  */
+ void init() {
+   int i;
+
+   // Force 16-byte alignment
+   // NOTE(review): assumes _malloc (from ../_malloc.h) returns aligned
+   // storage and reports failure with a null pointer, like malloc -- confirm.
+   //
+   if (in == NULL)
+     in = (short*) _malloc(N*sizeof(short));
+   if (vector == NULL)
+     vector = (char*) _malloc(N*sizeof(short));
+   if (scalar == NULL)
+     scalar = (char*) _malloc(N*sizeof(short));
+
+   if (in == NULL || vector == NULL || scalar == NULL) {
+     fprintf(stderr, "init: buffer allocation failed\n");
+     exit(1);
+   }
+
+   // Populate in with a range of values: -(N/2) .. N/2 - 1
+   //
+   for (i = 0; i < N; ++i) {
+     in[i] = -(N/2)+i;
+   }
+
+ }
+
+ /*
+  * run - time the scalar and the vector quantizer and check that they agree.
+  *
+  * scalar_time - out: user CPU time (in clock ticks, as reported by times())
+  *               spent in the scalar loop.
+  * vector_time - out: the same measurement for the vector loop.
+  *
+  * Each timed loop quantizes all N/64 blocks of `in` 100000 times.  After
+  * both loops the first N bytes of the `vector` and `scalar` buffers are
+  * compared; on the first mismatch "FAILED" is printed and the program exits
+  * with status 1.
+  */
+ void run(long *scalar_time, long *vector_time) {
+   int i,j;
+   int qp = 10;
+   struct tms buf_s, buf_e;
+
+   init();
+
+   times(&buf_s);
+   for (j = 0; j < 100000; ++j)
+     for (i = 0; i < N/64; ++i)
+       interquant_scalar(in+64*i, (signed char*) (scalar+64*i), qp);
+   times(&buf_e);
+
+   *scalar_time = buf_e.tms_utime - buf_s.tms_utime;
+   // the value is a long, so it needs %ld rather than %d
+   printf("scalar time=%ld, ", *scalar_time);
+
+   times(&buf_s);
+   for (j = 0; j < 100000; ++j)
+     for (i = 0; i < N/64; ++i)
+       interquant_vector(in+64*i, (signed char*) (vector+64*i), qp);
+   times(&buf_e);
+
+   *vector_time = buf_e.tms_utime - buf_s.tms_utime;
+   printf("vector time=%ld, ", *vector_time);
+
+   for (i = 0; i < N; i++) {
+     if (vector[i] != scalar[i]) {
+       printf("FAILED\n");
+       exit(1);
+     }
+   }
+
+   // NOTE(review): if the vector loop takes less than one clock tick,
+   // *vector_time is 0 and this ratio is not a finite number.
+   float speedup = ((float) *scalar_time) / *vector_time;
+   printf("speedup=%f\n", speedup);
+
+ }
+
+ /*
+  * main - repeat the benchmark NRUNS times and report the best timings.
+  *
+  * NOTE(review): NRUNS is not defined in this file; presumably the build
+  * supplies it (e.g. via ../Makefile.common) -- confirm.
+  *
+  * "PASSED" is printed only if every run() returned, i.e. no run detected a
+  * mismatch between the scalar and vector results.
+  */
+ int
+ main (void) {
+   unsigned i;
+   init();
+
+   long best_scalar = -1, best_vector = -1;
+   // named *_time so they do not hide the global `scalar` / `vector` buffers
+   long scalar_time, vector_time;
+   for (i = 0; i < NRUNS; ++i) {
+     run (&scalar_time, &vector_time);
+     // a negative best_* means "no measurement recorded yet"
+     if (best_scalar < 0 || best_scalar > scalar_time)
+       best_scalar = scalar_time;
+     if (best_vector < 0 || best_vector > vector_time)
+       best_vector = vector_time;
+   }
+
+   // the values are longs, so they need %ld rather than %d
+   printf("best scalar=%ld, ", best_scalar);
+   printf("best vector=%ld, ", best_vector);
+   printf("speedup=%f\n", ((float) best_scalar)/best_vector);
+   printf ("PASSED\n");
+   return 0;
+ }
+
+ /*
+  * interquant_scalar - scalar reference quantizer for one block of 64 values.
+  *
+  * in  - 64 input coefficients.
+  * out - receives 64 quantized values, each limited to at most 127 in
+  *       magnitude and carrying the sign of the corresponding input.
+  * qp  - quantization parameter; used as a divisor, so it must be non-zero.
+  *
+  * Per element: take the magnitude, subtract qp/2 (never going below zero),
+  * scale by the fixed-point factor (32768+qp)/(2*qp) and divide by 32768,
+  * clamp, then restore the sign.
+  */
+ void interquant_scalar( signed short* in, signed char* out, int qp) {
+   const int recip = (32768 + qp) / (2 * qp);   /* fixed-point 1/(2*qp) */
+   const int half_qp = qp / 2;
+   int k = 0;
+
+   while (k < 64) {
+     const short coeff = in[k];
+
+     /* magnitude; -32768 has no positive counterpart in a short */
+     short negated = (coeff == -32768) ? 32767 : (short) -coeff;
+     unsigned short magnitude =
+         (unsigned short) ((negated < coeff) ? coeff : negated);
+
+     /* dead zone: subtract qp/2 but never go below zero */
+     short reduced = (short) (magnitude - half_qp);
+     if (reduced <= 0) {
+       reduced = 0;
+     }
+
+     /* scale, keep within the range of a short, then cap at 127 */
+     int scaled = (reduced * recip) / 32768;
+     if (scaled > 32767) {
+       scaled = 32767;
+     }
+     if (scaled < -32768) {
+       scaled = -32768;
+     }
+     if (scaled >= 127) {
+       scaled = 127;
+     }
+
+     short level = (short) scaled;
+     if (coeff < 0) {
+       out[k] = -level;
+     } else {
+       out[k] = level;
+     }
+
+     ++k;
+   }
+ }
+
More information about the llvm-commits
mailing list