[llvm-commits] [vector_llvm] CVS: llvm/examples/SIMD/RGB2YUV/Makefile main.c rgb2yuv.altivec.handwritten.c rgb2yuv.sse.handwritten.c rgb2yuv.vectorc.c
Robert L. Bocchino Jr.
bocchino at persephone.cs.uiuc.edu
Sun Oct 23 15:50:20 PDT 2005
Changes in directory llvm/examples/SIMD/RGB2YUV:
Makefile added (r1.1.2.1)
main.c added (r1.1.2.1)
rgb2yuv.altivec.handwritten.c added (r1.1.2.1)
rgb2yuv.sse.handwritten.c added (r1.1.2.1)
rgb2yuv.vectorc.c added (r1.1.2.1)
---
Log message:
Examples to illustrate Vector LLVM's SIMD support.
---
Diffs of the changes: (+384 -0)
Makefile | 4
main.c | 148 +++++++++++++++++++++++++++
rgb2yuv.altivec.handwritten.c | 1
rgb2yuv.sse.handwritten.c | 230 ++++++++++++++++++++++++++++++++++++++++++
rgb2yuv.vectorc.c | 1
5 files changed, 384 insertions
Index: llvm/examples/SIMD/RGB2YUV/Makefile
diff -c /dev/null llvm/examples/SIMD/RGB2YUV/Makefile:1.1.2.1
*** /dev/null Sun Oct 23 17:50:00 2005
--- llvm/examples/SIMD/RGB2YUV/Makefile Sun Oct 23 17:49:41 2005
***************
*** 0 ****
--- 1,4 ----
+ NAME= rgb2yuv
+
+ include ../Makefile.common
+
Index: llvm/examples/SIMD/RGB2YUV/main.c
diff -c /dev/null llvm/examples/SIMD/RGB2YUV/main.c:1.1.2.1
*** /dev/null Sun Oct 23 17:50:17 2005
--- llvm/examples/SIMD/RGB2YUV/main.c Sun Oct 23 17:49:41 2005
***************
*** 0 ****
--- 1,148 ----
+ #define N 4800
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <sys/time.h>
+ #include <sys/times.h>
+ #include <assert.h>
+ #include "../_malloc.h"
+
+ void rgb2yuv_scalar(unsigned char*, int, unsigned char*);
+ void rgb2yuv_vector(unsigned char*, int, unsigned char*);
+
+ char *in;
+ char *out;
+ char *ref;
+
+ void init() {
+ int i;
+
+ // Force 16-byte alignment
+ //
+ in = (char*) _malloc(N*sizeof(char));
+ out = (char*) _malloc(N*sizeof(char));
+ ref = (char*) _malloc(N*sizeof(char));
+
+ // Populate in with a range of values
+ //
+ for (i = 0; i < N; ++i) {
+ in[i] = -(N/2)+i;
+ out[i] = 1;
+ ref[i] = 2;
+ }
+
+ }
+
+ void run(long *scalar_time, long *vector_time) {
+ long t0, t1, t2;
+ int i,j;
+
+ struct tms buf_s, buf_e;
+
+ times(&buf_s);
+ for (i = 0; i < 100000; ++i)
+ rgb2yuv_scalar(in, N, ref);
+ times(&buf_e);
+ *scalar_time = buf_e.tms_utime - buf_s.tms_utime;
+ printf("scalar time=%d, ", *scalar_time);
+
+ times(&buf_s);
+ for (i = 0; i < 100000; ++i)
+ rgb2yuv_vector(in, N, out);
+ times(&buf_e);
+ *vector_time = buf_e.tms_utime - buf_s.tms_utime;
+ printf("vector time=%d, ", *vector_time);
+
+ for (i = 0; i < N; i++) {
+ if (out[i] != ref[i]) {
+ printf("FAILED\n");
+ exit(1);
+ }
+ }
+
+ float speedup = ((float) *scalar_time) / *vector_time;
+ printf("speedup=%f\n", speedup);
+
+ }
+
+ int
+ main (void) {
+ unsigned i;
+ init();
+
+ long best_scalar = -1, best_vector = -1;
+ long scalar, vector;
+ for (i = 0; i < NRUNS; ++i) {
+ run (&scalar, &vector);
+ if (best_scalar < 0 || best_scalar > scalar)
+ best_scalar = scalar;
+ if (best_vector < 0 || best_vector > vector)
+ best_vector = vector;
+ }
+
+ printf("best scalar=%d, ", best_scalar);
+ printf("best vector=%d, ", best_vector);
+ printf("speedup=%f\n", ((float) best_scalar)/best_vector);
+ printf ("PASSED\n");
+ return 0;
+ }
+
/*
 * Scalar reference implementation of the RGB -> YCbCr kernel, together with
 * the fixed-point helpers it is built from.  The helpers are `static inline`
 * so that they always have a definition in this translation unit; a plain
 * `inline` definition provides no external definition under C99/C11 and
 * fails to link whenever the compiler chooses not to inline.
 */

/* Clamps a to the range of a signed 16-bit integer. */
static inline short saturate(int a) {
    if (a > 32767)
        return 32767;
    if (a < -32768)
        return -32768;
    return (short) a;
}

/*
 * Multiply, round, add with saturation: returns
 * saturate(((a * b + 2^14) >> 15) + c), i.e. a Q15 fixed-point product
 * rounded to nearest, plus c.
 *
 * The assert checks that the rounding used here agrees with the
 * "shift by 14, add 1, shift by 1" formulation on the truncated 16-bit
 * intermediate; it only holds while (a * b) >> 14 fits in a short.
 */
static inline short mradds(short a, short b, short c) {
    const int product = (int) a * (int) b;
    const int rounded = (product + (1 << 14)) >> 15;

    assert(rounded == ((((short) (product >> 14)) + 1) >> 1));
    return saturate(rounded + (int) c);
}

/* Saturating 16-bit addition. */
static inline short adds(short a, short b) {
    return saturate((int) a + (int) b);
}

/*
 * Clamps a to 0..255.  The parameter is int rather than unsigned short so
 * that a negative intermediate clamps to 0 instead of wrapping to a large
 * unsigned value and clamping to 255; every value an unsigned short can
 * hold is treated exactly as before.
 */
static inline unsigned char saturate_uchar(int a) {
    if (a > 255)
        return 255;
    if (a < 0)
        return 0;
    return (unsigned char) a;
}

/*
 * Converts packed RGB bytes to YCbCr, 16 pixels (48 bytes) at a time.
 *
 * RGB_char_ptr  input, R,G,B byte triples.
 * RGB_size      number of input bytes.  Only complete 48-byte blocks are
 *               converted; a trailing partial block and non-positive sizes
 *               are ignored, so nothing outside [0, RGB_size) is touched.
 * YCC_char_ptr  output, same size as the input.  Each 48-byte block holds
 *               16 Y bytes, then 16 Cb bytes, then 16 Cr bytes.
 *
 * The multipliers are Q15 fixed-point weights (value / 32768); they appear
 * to be the usual studio-range BT.601 coefficients.
 */
void rgb2yuv_scalar(unsigned char *RGB_char_ptr, int RGB_size,
                    unsigned char *YCC_char_ptr) {
    enum { PIXELS_PER_BLOCK = 16, BYTES_PER_BLOCK = 3 * PIXELS_PER_BLOCK };
    int i, j;

    /* "RGB_size - i >= block" cannot overflow and is false for size <= 0. */
    for (i = 0; RGB_size - i >= BYTES_PER_BLOCK; i += BYTES_PER_BLOCK) {
        const unsigned char *rgb = RGB_char_ptr + i;
        unsigned char *ycc = YCC_char_ptr + i;

        for (j = 0; j < PIXELS_PER_BLOCK; ++j) {
            const short red   = rgb[3 * j];
            const short green = rgb[3 * j + 1];
            const short blue  = rgb[3 * j + 2];
            short Y, Cb, Cr;

            Y  = mradds(red,  8432, 0);
            Cb = mradds(red, -4818, 0);
            Cr = mradds(red, 14345, 0);

            Y  = mradds(green,  16425, Y);
            Cb = mradds(green,  -9527, Cb);
            Cr = mradds(green, -12045, Cr);

            Y  = mradds(blue,  3176, Y);
            Cb = mradds(blue, 14345, Cb);
            Cr = mradds(blue, -2300, Cr);

            /* Offsets: +16 for luma, +128 for both chroma channels. */
            Y  = adds(Y, 16);
            Cb = adds(Cb, 128);
            Cr = adds(Cr, 128);

            ycc[j]                        = saturate_uchar(Y);
            ycc[j + PIXELS_PER_BLOCK]     = saturate_uchar(Cb);
            ycc[j + 2 * PIXELS_PER_BLOCK] = saturate_uchar(Cr);
        }
    }
}
+
Index: llvm/examples/SIMD/RGB2YUV/rgb2yuv.altivec.handwritten.c
diff -c /dev/null llvm/examples/SIMD/RGB2YUV/rgb2yuv.altivec.handwritten.c:1.1.2.1
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/RGB2YUV/rgb2yuv.altivec.handwritten.c Sun Oct 23 17:49:41 2005
***************
*** 0 ****
--- 1 ----
+ void rgb2yuv_vector(unsigned char *RGB_char_ptr, int RGB_size,
unsigned char *YCC_char_ptr) {
vector unsigned char *RGB_ptr = (vector unsigned char*) RGB_char_ptr;
vector unsigned char *YCC_ptr = (vector unsigned char*) YCC_char_ptr;
vector signed short r0, r1, r2, g0, g1, g2, b0, b1, b2, c0, c16, c128;
vector unsigned char z0, tc0, tc1, tc2, tc3;
vector signed short tr0, tr1, tg0, tg1, tb0, tb1, mask;
vector signed short t0, t1, t2, t3, t4, t5;
int i, j;
vector unsigned char vPerm1 =
(vector unsigned char)( 0, 3, 6, 9, 12, 15, 18, 21, /* R0..R7 */
1, 4, 7, 10, 13, 16, 19, 22 /* G0..G7 */);
vector unsigned char vPerm2 =
(vector unsigned char)( 2, 5, 8, 11, 14, 17, 20, 23, /* B0..B7 */
0, 0, 0, 0, 0, 0, 0, 0 /* dont care */);
vector unsigned char vPerm3 =
(vector unsigned char)( 8, 11, 14, 17, 20, 23, 26, 29, /* R8..R15 */
9
, 12, 15, 18, 21, 24, 27, 30 /* G8..G15 */);
vector unsigned char vPerm4 =
(vector unsigned char)(10, 13, 16, 19, 22, 25, 28, 31, /* B8..B15 */
0, 0, 0, 0, 0, 0, 0, 0 /* dont care */);
vector signed short vConst1 =
(vector signed short)( 8432, 16425, 3176,
-4818, -9527, 14345,
0, 0 );
vector signed short vConst2 =
(vector signed short)( 14345, -12045, -2300,
16, 128, 0, 0, 0 );
r0 = vec_splat( vConst1, 0 ); /* 8432 */
g0 = vec_splat( vConst1, 1 ); /* 16425 */
b0 = vec_splat( vConst1, 2 ); /* 3176 */
r1 = vec_splat( vConst1, 3 ); /* -4818 */
g1 = vec_splat( vConst1, 4 ); /* -9527 */
b1 = vec_splat( vConst1, 5 ); /* 14345 */
r2 = vec_splat( vConst2, 0 ); /* 14345 */
g2 = vec_splat( vConst2, 1 ); /*-12045 */
b2 = vec_splat( vConst2, 2 ); /* -2300 */
c16 = vec_splat( vConst2, 3 ); /* 16 */
c128 = vec
_splat( vConst2, 4 ); /* 128 */
c0 = (vector signed short) (0); /* 0 */
z0 = (vector unsigned char) (0); /* 0 */
mask = (vector signed short) (0x00FF);
vector unsigned char Ys;
vector unsigned char Cbs;
vector unsigned char Crs;
for ( i = 0; i < (RGB_size/sizeof(vector unsigned char)); i+=3 ) {
tc0 = vec_perm( RGB_ptr[i], RGB_ptr[i+1], vPerm1 ); /* R0..R7 G0..G7 */
tc1 = vec_perm( RGB_ptr[i], RGB_ptr[i+1], vPerm2 ); /* B0..B7 */
tc2 = vec_perm( RGB_ptr[i+1], RGB_ptr[i+2], vPerm3 ); /* R8..R15 G8..G15 */
tc3 = vec_perm( RGB_ptr[i+1], RGB_ptr[i+2], vPerm4 ); /* B8..B15 */
tr0 = vec_and(vec_unpackh( (vector signed char) tc0 ), mask); /* tr0 = R0 .. R7 */
tg0 = vec_and(vec_unpackl( (vector signed char) tc0 ), mask); /* tg0 = G0 .. G7 */
tb0 = vec_and(vec_unpackh( (vector signed char) tc1 ), mask); /* tb0 = B0 .. B7 */
tr1 = vec_and(vec_unpackh( (vector signed char) tc2 ), mask); /* tr0 = R8 .. R15
*/
tg1 = vec_and(vec_unpackl( (vector signed char) tc2 ), mask); /* tg0 = G8 .. G15 */
tb1 = vec_and(vec_unpackh( (vector signed char) tc3 ), mask); /* tb0 = B8 .. B15 */
t0 = vec_mradds( tr0, r0, c0 ); /* (R0 .. R7) * 8432 */
t1 = vec_mradds( tr0, r1, c0 ); /* (R0 .. R7) * -4818 */
t2 = vec_mradds( tr0, r2, c0 ); /* (R0 .. R7) * 14345 */
t0 = vec_mradds( tg0, g0, t0 ); /* += (G0 .. G7) * 16425 */
t1 = vec_mradds( tg0, g1, t1 ); /* += (G0 .. G7) * -9527 */
t2 = vec_mradds( tg0, g2, t2 ); /* += (G0 .. G7) * -12045 */
t0 = vec_mradds( tb0, b0, t0 ); /* += (B0 .. B7) * 3176 */
t1 = vec_mradds( tb0, b1, t1 ); /* += (B0 .. B7) * 14345 */
t2 = vec_mradds( tb0, b2, t2 ); /* += (B0 .. B7) * -2300 */
/* Convert the next three input vectors. */
t3 = vec_mradds( tr1, r0, c0 ); /* (R8 .. R15) * 8432 */
t4 = vec_mradds( tr1, r1, c0 ); /* (R8 .. R15) * -4818 */
t5 = vec_mradds( tr1, r2, c0 ); /* (R8 .. R15) * 14345 */
t3 = vec_mradds( tg1, g0, t3 ); /* += (G8 .. G15) * 16425 */
t4 = vec_mradds( tg1, g1, t4 ); /* += (G8 .. G15) * -9527 */
t5 = vec_mradds( tg1, g2, t5 ); /* += (G8 .. G15) * -12045 */
t3 = vec_mradds( tb1, b0, t3 ); /* += (B8 .. B15) * 3176 */
t4 = vec_mradds( tb1, b1, t4 ); /* += (B8 .. B15) * 14345 */
t5 = vec_mradds( tb1, b2, t5 ); /* += (B8 .. B15) * -2300 */
t0 = vec_adds( t0, c16 );
t3 = vec_adds( t3, c16 );
t1 = vec_adds( t1, c128 );
t4 = vec_adds( t4, c128 );
t2 = vec_adds( t2, c128 );
t5 = vec_adds( t5, c128 );
YCC_ptr[i] = vec_packsu( t0, t3 ); /* Y0 .. Y15 */
YCC_ptr[i+1] = vec_packsu( t1, t4 ); /* Cb0 .. Cb15 */
YCC_ptr[i+2] = vec_packsu( t2, t5 ); /* Cr0 .. Cr15 */
}
}
\ No newline at end of file
Index: llvm/examples/SIMD/RGB2YUV/rgb2yuv.sse.handwritten.c
diff -c /dev/null llvm/examples/SIMD/RGB2YUV/rgb2yuv.sse.handwritten.c:1.1.2.1
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/RGB2YUV/rgb2yuv.sse.handwritten.c Sun Oct 23 17:49:41 2005
***************
*** 0 ****
--- 1,230 ----
#include <emmintrin.h>
#include <stdio.h>
+
/* Reinterprets the object x as a 128-bit integer vector lvalue.  The
 * dereference is an aligned access, so x must be 16-byte aligned. */
#define VECTOR(x) (*((__m128i *) &(x)))
/* Vector with all eight 16-bit lanes set to x; x is evaluated once. */
#define CONSTANT(x) _mm_set1_epi16(x)
+
/*
 * Per 16-bit lane: rounded Q15 product of x and the scalar y,
 * i.e. (((short) ((x * y) >> 14)) + 1) >> 1.
 *
 * The 32-bit product is rebuilt from its high and low 16-bit halves:
 * (hi << 2) | (lo >> 14) is the 16-bit window starting at bit 14.  Adding 1
 * and shifting right arithmetically by 1 then rounds to nearest.  The add
 * wraps rather than saturates, so the windowed value must stay below 32767.
 *
 * `static inline` guarantees a definition in this translation unit; a plain
 * `inline` definition may fail to link under C99/C11.
 */
static inline __m128i vec_mr(__m128i x, short y) {
    const __m128i one     = _mm_set1_epi16(1);
    const __m128i y_vec   = _mm_set1_epi16(y);
    const __m128i prod_hi = _mm_mulhi_epi16(x, y_vec);   /* bits 16..31 */
    const __m128i prod_lo = _mm_mullo_epi16(x, y_vec);   /* bits  0..15 */
    __m128i window = _mm_or_si128(_mm_slli_epi16(prod_hi, 2),
                                  _mm_srli_epi16(prod_lo, 14));

    window = _mm_add_epi16(window, one);
    return _mm_srai_epi16(window, 1);
}

/* Per lane: vec_mr(x, y) + z with signed 16-bit saturation. */
static inline __m128i vec_mradds(__m128i x, short y, __m128i z) {
    return _mm_adds_epi16(vec_mr(x, y), z);
}

/* Macro form of vec_mradds. */
#define MRADDS(x,y,z) _mm_adds_epi16(vec_mr((x),(y)),(z))
+
/* Debug aid: prints the four 2-bit fields of ch, lowest field first,
 * each followed by a space, then a newline. */
void print_quaternary(unsigned char ch) {
    unsigned shift;

    for (shift = 0; shift < 8; shift += 2)
        printf("%d ", (ch >> shift) & 3);
    printf("\n");
}
+
/* Debug aid: prints the 16 bytes of vec in memory order as two-digit
 * upper-case hex values, each followed by a space, then a newline. */
void print_vector_128(__m128i vec) {
    const unsigned char *byte = (const unsigned char *) &vec;
    const unsigned char *const end = byte + 16;

    while (byte != end)
        printf("%02X ", *byte++);
    printf("\n");
}
+
/* 8-bit shuffle selector: result lane k (k = 0..3) takes source lane idxk.
 * Arguments and expansion are parenthesized so the macro can be used inside
 * larger expressions. */
#define idx(idx0, idx1, idx2, idx3) \
    ((idx0) | ((idx1) << 2) | ((idx2) << 4) | ((idx3) << 6))

/* Rearranges the four 32-bit lanes of source; the indices must be
 * compile-time constants. */
#define extract(source, idx0, idx1, idx2, idx3) \
    _mm_shuffle_epi32((source), idx(idx0, idx1, idx2, idx3))

/* Lane mask: lane k is all ones when idxk is 1 and zero when it is 0. */
#define msk(idx0, idx1, idx2, idx3) \
    _mm_set_epi32((int) ((idx3) * ~0U), (int) ((idx2) * ~0U), \
                  (int) ((idx1) * ~0U), (int) ((idx0) * ~0U))

/* Keeps the 32-bit lanes of source whose flag is 1 and zeroes the rest. */
#define mask(source, idx0, idx1, idx2, idx3) \
    _mm_and_si128((source), msk(idx0, idx1, idx2, idx3))
+
+ void rgb2yuv_vector(unsigned char *RGB_char_ptr, int RGB_size,
+ unsigned char *YCC_char_ptr) {
+
+ __m128i* RGB_ptr = (__m128i*) RGB_char_ptr;
+ __m128i zero = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+ __m128i constant_16 = CONSTANT(16);
+ __m128i constant_128 = CONSTANT(128);
+
+ unsigned j, i;
+
+ __m128i in0123, in01, in0, in1, in23, in2, in3;
+ __m128i in4567, in45, in4, in5, in67, in6, in7;
+ __m128i in89AB, in89, in8, in9, inAB, inA, inB;
+
+ __m128i red0, red1, red2, red_lo, red_hi;
+ __m128i green0, green1, green_lo, green_hi;
+ __m128i blue0, blue1, blue_lo, blue_hi;
+
+ __m128i red_16_lo, red_16_hi;
+ __m128i green_16_lo, green_16_hi;
+ __m128i blue_16_lo, blue_16_hi;
+
+ __m128i Y_lo, Y_hi;
+ __m128i Cb_lo, Cb_hi;
+ __m128i Cr_lo, Cr_hi;
+
+ __m128i Ys_char, Cbs_char, Crs_char;
+
+ for (i = 0; i < RGB_size; i += 3*16) {
+ in0123 = RGB_ptr[i/16];
+ in01 = _mm_unpacklo_epi8(in0123, zero);
+ in0 = _mm_unpacklo_epi16(in01, zero);
+ in1 = _mm_unpackhi_epi16(in01, zero);
+ in23 = _mm_unpackhi_epi8(in0123, zero);
+ in2 = _mm_unpacklo_epi16(in23, zero);
+ in3 = _mm_unpackhi_epi16(in23, zero);
+ in4567 = RGB_ptr[i/16+1];
+ in45 = _mm_unpacklo_epi8(in4567, zero);
+ in4 = _mm_unpacklo_epi16(in45, zero);
+ in5 = _mm_unpackhi_epi16(in45, zero);
+ in67 = _mm_unpackhi_epi8(in4567, zero);
+ in6 = _mm_unpacklo_epi16(in67, zero);
+ in7 = _mm_unpackhi_epi16(in67, zero);
+ in89AB = RGB_ptr[i/16+2];
+ in89 = _mm_unpacklo_epi8(in89AB, zero);
+ in8 = _mm_unpacklo_epi16(in89, zero);
+ in9 = _mm_unpackhi_epi16(in89, zero);
+ inAB = _mm_unpackhi_epi8(in89AB, zero);
+ inA = _mm_unpacklo_epi16(inAB, zero);
+ inB = _mm_unpackhi_epi16(inAB, zero);
+
+ red0 = _mm_and_si128(_mm_shuffle_epi32(in0, idx(0,3,0,0)), msk(1,1,0,0));
+ red1 = _mm_and_si128(in1, msk(0,0,1,0));
+ red2 = _mm_and_si128(_mm_shuffle_epi32(in2, idx(0,0,0,1)), msk(0,0,0,1));
+ red_lo = _mm_or_si128(_mm_or_si128(red0, red1), red2);
+
+ red0 = _mm_and_si128(_mm_shuffle_epi32(in3, idx(0,3,0,0)), msk(1,1,0,0));
+ red1 = _mm_and_si128(in4, msk(0,0,1,0));
+ red2 = _mm_and_si128(_mm_shuffle_epi32(in5, idx(0,0,0,1)), msk(0,0,0,1));
+ red_hi = _mm_or_si128(_mm_or_si128(red0, red1), red2);
+ red_16_lo = _mm_packs_epi32(red_lo, red_hi);
+
+ red0 = _mm_and_si128(_mm_shuffle_epi32(in6, idx(0,3,0,0)), msk(1,1,0,0));
+ red1 = _mm_and_si128(in7, msk(0,0,1,0));
+ red0 = _mm_or_si128(red0, red1);
+ red1 = _mm_and_si128(_mm_shuffle_epi32(in8, idx(0,0,0,1)), msk(0,0,0,1));
+ red_lo = _mm_or_si128(red0, red1);
+
+ red0 = _mm_and_si128(_mm_shuffle_epi32(in9, idx(0,3,0,0)), msk(1,1,0,0));
+ red1 = _mm_and_si128(inA, msk(0,0,1,0));
+ red0 = _mm_or_si128(red0, red1);
+ red1 = _mm_and_si128(_mm_shuffle_epi32(inB, idx(0,0,0,1)), msk(0,0,0,1));
+ red_hi = _mm_or_si128(red0, red1);
+ red_16_hi = _mm_packs_epi32(red_lo, red_hi);
+
+ green0 = mask(extract(in0, 1,0,0,0), 1, 0, 0, 0);
+ green1 = mask(extract(in1, 0, 0, 3, 0), 0, 1, 1, 0);
+ green0 = _mm_or_si128(green0, green1);
+ green1 = mask(extract(in2, 0, 0, 0, 2), 0, 0, 0, 1);
+ green_lo = _mm_or_si128(green0, green1);
+
+ green0 = mask(extract(in3, 1,0,0,0), 1, 0, 0, 0);
+ green1 = mask(extract(in4, 0, 0, 3, 0), 0, 1, 1, 0);
+ green0 = _mm_or_si128(green0, green1);
+ green1 = mask(extract(in5, 0, 0, 0, 2), 0, 0, 0, 1);
+ green_hi = _mm_or_si128(green0, green1);
+ green_16_lo = _mm_packs_epi32(green_lo, green_hi);
+
+ green0 = mask(extract(in6, 1,0,0,0), 1, 0, 0, 0);
+ green1 = mask(extract(in7, 0, 0, 3, 0), 0, 1, 1, 0);
+ green0 = _mm_or_si128(green0, green1);
+ green1 = mask(extract(in8, 0, 0, 0, 2), 0, 0, 0, 1);
+ green_lo = _mm_or_si128(green0, green1);
+
+ green0 = mask(extract(in9, 1,0,0,0), 1, 0, 0, 0);
+ green1 = mask(extract(inA, 0, 0, 3, 0), 0, 1, 1, 0);
+ green0 = _mm_or_si128(green0, green1);
+ green1 = mask(extract(inB, 0, 0, 0, 2), 0, 0, 0, 1);
+ green_hi = _mm_or_si128(green0, green1);
+ green_16_hi = _mm_packs_epi32(green_lo, green_hi);
+
+ blue0 = mask(extract(in0, 2,0,0,0), 1, 0, 0, 0);
+ blue1 = mask(extract(in1, 0, 1, 0, 0), 0, 1, 0, 0);
+ blue0 = _mm_or_si128(blue0, blue1);
+ blue1 = mask(extract(in2, 0, 0, 0, 3), 0, 0, 1, 1);
+ blue_lo = _mm_or_si128(blue0, blue1);
+
+ blue0 = mask(extract(in3, 2,0,0,0), 1, 0, 0, 0);
+ blue1 = mask(extract(in4, 0, 1, 0, 0), 0, 1, 0, 0);
+ blue0 = _mm_or_si128(blue0, blue1);
+ blue1 = mask(extract(in5, 0, 0, 0, 3), 0, 0, 1, 1);
+ blue_hi = _mm_or_si128(blue0, blue1);
+ blue_16_lo = _mm_packs_epi32(blue_lo, blue_hi);
+
+ blue0 = mask(extract(in6, 2,0,0,0), 1, 0, 0, 0);
+ blue1 = mask(extract(in7, 0, 1, 0, 0), 0, 1, 0, 0);
+ blue0 = _mm_or_si128(blue0, blue1);
+ blue1 = mask(extract(in8, 0, 0, 0, 3), 0, 0, 1, 1);
+ blue_lo = _mm_or_si128(blue0, blue1);
+
+ blue0 = mask(extract(in9, 2,0,0,0), 1, 0, 0, 0);
+ blue1 = mask(extract(inA, 0, 1, 0, 0), 0, 1, 0, 0);
+ blue0 = _mm_or_si128(blue0, blue1);
+ blue1 = mask(extract(inB, 0, 0, 0, 3), 0, 0, 1, 1);
+ blue_hi = _mm_or_si128(blue0, blue1);
+ blue_16_hi = _mm_packs_epi32(blue_lo, blue_hi);
+
+ Y_lo = vec_mr(red_16_lo, 8432);
+ Y_hi = vec_mr(red_16_hi, 8432);
+
+ Cb_lo = vec_mr(red_16_lo, -4818);
+ Cb_hi = vec_mr(red_16_hi, -4818);
+
+ Cr_lo = vec_mr(red_16_lo, 14345);
+ Cr_hi = vec_mr(red_16_hi, 14345);
+
+ Y_lo = vec_mradds(green_16_lo, 16425, Y_lo);
+ Y_hi = vec_mradds(green_16_hi, 16425, Y_hi);
+
+ Cb_lo = vec_mradds(green_16_lo, -9527, Cb_lo);
+ Cb_hi = vec_mradds(green_16_hi, -9527, Cb_hi);
+
+ Cr_lo = vec_mradds(green_16_lo, -12045, Cr_lo);
+ Cr_hi = vec_mradds(green_16_hi, -12045, Cr_hi);
+
+ Y_lo = vec_mradds(blue_16_lo, 3176, Y_lo);
+ Y_hi = vec_mradds(blue_16_hi, 3176, Y_hi);
+
+ Cb_lo = vec_mradds(blue_16_lo, 14345, Cb_lo);
+ Cb_hi = vec_mradds(blue_16_hi, 14345, Cb_hi);
+
+ Cr_lo = vec_mradds(blue_16_lo, -2300, Cr_lo);
+ Cr_hi = vec_mradds(blue_16_hi, -2300, Cr_hi);
+
+ Y_lo = _mm_adds_epi16(Y_lo, constant_16);
+ Y_hi = _mm_adds_epi16(Y_hi, constant_16);
+
+ Cb_lo = _mm_adds_epi16(Cb_lo, constant_128);
+ Cb_hi = _mm_adds_epi16(Cb_hi, constant_128);
+
+ Cr_lo = _mm_adds_epi16(Cr_lo, constant_128);
+ Cr_hi = _mm_adds_epi16(Cr_hi, constant_128);
+
+ Ys_char = _mm_packus_epi16(Y_lo, Y_hi);
+ Cbs_char = _mm_packus_epi16(Cb_lo, Cb_hi);
+ Crs_char = _mm_packus_epi16(Cr_lo, Cr_hi);
+
+ for (j = 0; j < 16; ++j) {
+ YCC_char_ptr[i+3*j] = ((unsigned char*) &Ys_char)[j];
+ YCC_char_ptr[i+1+3*j] = ((unsigned char*) &Cbs_char)[j];
+ YCC_char_ptr[i+2+3*j] = ((unsigned char*) &Crs_char)[j];
+ }
+ }
+
+ malloc(0);
+
+ }
Index: llvm/examples/SIMD/RGB2YUV/rgb2yuv.vectorc.c
diff -c /dev/null llvm/examples/SIMD/RGB2YUV/rgb2yuv.vectorc.c:1.1.2.1
*** /dev/null Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/RGB2YUV/rgb2yuv.vectorc.c Sun Oct 23 17:49:41 2005
***************
*** 0 ****
--- 1 ----
+ #include "VectorC.h"
#include "Intrinsics.h"
// Selects whether to use vllvm_mradds with third argument c0
// (constant 0), or vllvm_mr instead. Using c0 may be slightly faster
// on AltiVec. Not using c0 saves an "adds x, 0" op and may be
// slightly faster on architectures (e.g., SSE2) that don't support
// mradds as a single op. Smarter code generation can make USE_C0 0
// just as fast on AltiVec as USE_C0 1.
//
#define USE_C0 1
// Note that both vllvm_mr and vllvm_mradds are defined in
// "Intrinsics.h" and expand to patterns of more primitive Vector LLVM
// instructions
//
/*
 * Vector C version of the RGB -> YCbCr kernel, written against the vllvm_*
 * intrinsics from "VectorC.h" / "Intrinsics.h".
 *
 * NOTE(review): the scalar-typed locals below receive the results of vector
 * intrinsics, so they presumably stand for vectors under the Vector LLVM
 * front end -- confirm against VectorC.h.
 *
 * RGB_ptr   input, packed R,G,B bytes, read in 16-byte units.
 * RGB_size  number of input bytes.  Only complete groups of three 16-byte
 *           units (16 pixels) are converted.
 * YCC_ptr   output; for every input group it receives one 16-byte unit of
 *           Y, one of Cb and one of Cr.
 */
void rgb2yuv_vector(unsigned char *RGB_ptr, int RGB_size,
                    unsigned char *YCC_ptr) {
    signed short r0, r1, r2, g0, g1, g2, b0, b1, b2, c16, c128;
    unsigned char tc0, tc1, tc2, tc3;
    signed short tr0, tr1, tg0, tg1, tb0, tb1;
    signed short t0, t1, t2, t3, t4, t5;
    int i;

    /* Byte selectors applied to a 32-byte concatenation of two inputs. */
    unsigned char vPerm1 =
        vllvm_constant_unsigned_char( 0,  3,  6,  9, 12, 15, 18, 21,
                                      1,  4,  7, 10, 13, 16, 19, 22);
    unsigned char vPerm2 =
        vllvm_constant_unsigned_char( 2,  5,  8, 11, 14, 17, 20, 23,
                                      0,  0,  0,  0,  0,  0,  0,  0);
    unsigned char vPerm3 =
        vllvm_constant_unsigned_char( 8, 11, 14, 17, 20, 23, 26, 29,
                                      9, 12, 15, 18, 21, 24, 27, 30);
    unsigned char vPerm4 =
        vllvm_constant_unsigned_char(10, 13, 16, 19, 22, 25, 28, 31,
                                      0,  0,  0,  0,  0,  0,  0,  0);

    /* Fixed-point weights and offsets, eight lanes each. */
    r0 = vllvm_fixed_vimm_short(8432, 8);
    g0 = vllvm_fixed_vimm_short(16425, 8);
    b0 = vllvm_fixed_vimm_short(3176, 8);
    r1 = vllvm_fixed_vimm_short(-4818, 8);
    g1 = vllvm_fixed_vimm_short(-9527, 8);
    b1 = vllvm_fixed_vimm_short(14345, 8);
    r2 = vllvm_fixed_vimm_short(14345, 8);
    g2 = vllvm_fixed_vimm_short(-12045, 8);
    b2 = vllvm_fixed_vimm_short(-2300, 8);
    c16 = vllvm_fixed_vimm_short(16, 8);
    c128 = vllvm_fixed_vimm_short(128, 8);
#if USE_C0
    signed short c0 = vllvm_fixed_vimm_short(0, 8);
#endif

    for ( i = 0; i + 2 < (RGB_size/16); i+=3 ) {
        unsigned char v0 = vllvm_load_unsigned_char(RGB_ptr, 16, i);
        unsigned char v1 = vllvm_load_unsigned_char(RGB_ptr, 16, i+1);
        unsigned char v2 = vllvm_load_unsigned_char(RGB_ptr, 16, i+2);

        /* v0:v1 permuted by vPerm1 and vPerm2. */
        char tmp = vllvm_fixed_vimm_char(0, 32);
        tmp = vllvm_fixed_combine_unsigned_char(tmp, 32, v0, 16, 0, 1);
        tmp = vllvm_fixed_combine_unsigned_char(tmp, 32, v1, 16, 16, 1);
        tc0 = vllvm_fixed_permute_unsigned_char(tmp, 32, vPerm1, 16);
        char tmp1 = vllvm_fixed_vimm_char(0, 32);
        tmp1 = vllvm_fixed_combine_unsigned_char(tmp1, 32, v0, 16, 0, 1);
        tmp1 = vllvm_fixed_combine_unsigned_char(tmp1, 32, v1, 16, 16, 1);
        tc1 = vllvm_fixed_permute_unsigned_char(tmp1, 32, vPerm2, 16);

        /* v1:v2 permuted by vPerm3 and vPerm4. */
        char tmp2 = vllvm_fixed_vimm_char(0, 32);
        tmp2 = vllvm_fixed_combine_unsigned_char(tmp2, 32, v1, 16, 0, 1);
        tmp2 = vllvm_fixed_combine_unsigned_char(tmp2, 32, v2, 16, 16, 1);
        tc2 = vllvm_fixed_permute_unsigned_char(tmp2, 32, vPerm3, 16);
        char tmp3 = vllvm_fixed_vimm_char(0, 32);
        tmp3 = vllvm_fixed_combine_unsigned_char(tmp3, 32, v1, 16, 0, 1);
        tmp3 = vllvm_fixed_combine_unsigned_char(tmp3, 32, v2, 16, 16, 1);
        tc3 = vllvm_fixed_permute_unsigned_char(tmp3, 32, vPerm4, 16);

        /* Split the permuted bytes into per-channel groups of eight. */
        tr0 = _extract_unsigned_char(tc0, 0, 1, 8);
        tg0 = _extract_unsigned_char(tc0, 8, 1, 8);
        tb0 = _extract_unsigned_char(tc1, 0, 1, 8);
        tr1 = _extract_unsigned_char(tc2, 0, 1, 8);
        tg1 = _extract_unsigned_char(tc2, 8, 1, 8);
        tb1 = _extract_unsigned_char(tc3, 0, 1, 8);

        /* First eight pixels: t0 accumulates Y, t1 Cb, t2 Cr. */
#if USE_C0
        t0 = vllvm_mradds_short( tr0, r0, c0 );
        t1 = vllvm_mradds_short( tr0, r1, c0 );
        t2 = vllvm_mradds_short( tr0, r2, c0 );
#else
        t0 = vllvm_mr_short( tr0, r0 );
        t1 = vllvm_mr_short( tr0, r1 );
        t2 = vllvm_mr_short( tr0, r2 );
#endif
        t0 = vllvm_mradds_short( tg0, g0, t0 );
        t1 = vllvm_mradds_short( tg0, g1, t1 );
        t2 = vllvm_mradds_short( tg0, g2, t2 );
        t0 = vllvm_mradds_short( tb0, b0, t0 );
        t1 = vllvm_mradds_short( tb0, b1, t1 );
        t2 = vllvm_mradds_short( tb0, b2, t2 );

        /* Second eight pixels: t3 accumulates Y, t4 Cb, t5 Cr. */
#if USE_C0
        t3 = vllvm_mradds_short( tr1, r0, c0 );
        t4 = vllvm_mradds_short( tr1, r1, c0 );
        t5 = vllvm_mradds_short( tr1, r2, c0 );
#else
        t3 = vllvm_mr_short( tr1, r0 );
        t4 = vllvm_mr_short( tr1, r1 );
        t5 = vllvm_mr_short( tr1, r2 );
#endif
        t3 = vllvm_mradds_short( tg1, g0, t3 );
        t4 = vllvm_mradds_short( tg1, g1, t4 );
        t5 = vllvm_mradds_short( tg1, g2, t5 );
        t3 = vllvm_mradds_short( tb1, b0, t3 );
        t4 = vllvm_mradds_short( tb1, b1, t4 );
        t5 = vllvm_mradds_short( tb1, b2, t5 );

        /* Offsets: +16 for Y, +128 for Cb and Cr. */
        t0 = vllvm_adds_short( t0, c16 );
        t3 = vllvm_adds_short( t3, c16 );
        t1 = vllvm_adds_short( t1, c128 );
        t4 = vllvm_adds_short( t4, c128 );
        t2 = vllvm_adds_short( t2, c128 );
        t5 = vllvm_adds_short( t5, c128 );

        /* Join the two halves of each channel, saturate to bytes, store. */
        short out0 = vllvm_fixed_vimm_short(0, 16);
        short out1 = vllvm_fixed_combine_short(out0, 16, t0, 8, 0, 1);
        short out2 = vllvm_fixed_combine_short(out1, 16, t3, 8, 8, 1);
        unsigned char out3 = vllvm_saturate_short_uchar(out2);
        vllvm_store_unsigned_char(out3, YCC_ptr, i);
        out1 = vllvm_fixed_combine_short(out0, 16, t1, 8, 0, 1);
        out2 = vllvm_fixed_combine_short(out1, 16, t4, 8, 8, 1);
        out3 = vllvm_saturate_short_uchar(out2);
        vllvm_store_unsigned_char(out3, YCC_ptr, i+1);
        out1 = vllvm_fixed_combine_short(out0, 16, t2, 8, 0, 1);
        out2 = vllvm_fixed_combine_short(out1, 16, t5, 8, 8, 1);
        out3 = vllvm_saturate_short_uchar(out2);
        vllvm_store_unsigned_char(out3, YCC_ptr, i+2);
    }
}
\ No newline at end of file
More information about the llvm-commits mailing list