00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include <cstdio>
00027 #include <cstdlib>
00028 #include <algorithm>
00029 #include <inttypes.h>
00030 #include <limits.h>
00031 #include "config.h"
00032
00033 #ifdef MMX
00034 extern "C" {
00035 #include "libavcodec/i386/mmx.h"
00036 }
00037 #define CPU_MMXEXT 0
00038 #define CPU_MMX 1
00039 #endif
00040
00041 #ifdef HAVE_ALTIVEC
00042 extern "C" int has_altivec(void);
00043 #ifdef HAVE_ALTIVEC_H
00044 #include <altivec.h>
00045 #else
00046 #include <Accelerate/Accelerate.h>
00047 #endif
00048 #endif
00049 #include "yuv2rgb.h"
00050
/*
 * Forward declaration of the plain C fallback converter.  It is defined
 * further down in this file and is declared here so that
 * yuv2rgb_init_mmx() can return its address.
 */
00056 static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
00057 unsigned char *pu, unsigned char *pv,
00058 int h_size, int v_size, int rgb_stride,
00059 int y_stride, int uv_stride, int alphaones);
00060
00061
00062
/*
 * movntq(src, dest): store the MMX register `src` to the memory operand
 * `dest`.  Expands to movntq_r2m when the variable `cpu` equals CPU_MMXEXT
 * and to movq_r2m otherwise, so a variable named `cpu` must be in scope
 * wherever the macro is used.  CPU_MMXEXT is only defined under
 * #ifdef MMX, so the macro can only be expanded in MMX builds.
 * NOTE(review): movntq_r2m is presumably the non-temporal (cache-bypassing)
 * 64-bit store from mmx.h -- confirm.
 */
00063 #define movntq(src,dest) \
00064 do { \
00065 if (cpu == CPU_MMXEXT) \
00066 movntq_r2m (src, dest); \
00067 else \
00068 movq_r2m (src, dest); \
00069 } while (0)
00070
00071 #ifdef MMX
/*
 * Convert 8 luma samples, plus the 4 U and 4 V samples they share, into
 * three MMX registers of 8 packed bytes each.  Nothing is written to
 * memory; the result is left in registers for mmx_unpack_16rgb() or
 * mmx_unpack_32rgb() to store.
 *
 * py     - points at the 8 luma bytes to convert
 * pu, pv - point at the 4 U and 4 V bytes covering those 8 pixels
 *
 * On exit: mm0 = blue, mm1 = red, mm2 = green (named after the
 * mmx_U_blue / mmx_V_red / mmx_*_green coefficients feeding each register).
 * mm3..mm7 are used as scratch.
 *
 * NOTE(review): the per-line comments assume the usual mmx.h conventions
 * (first macro argument is the source, second the destination; pmulhw
 * keeps the high 16 bits of each signed 16x16 product) -- confirm against
 * libavcodec/i386/mmx.h.
 */
00072 static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
00073 {
00074 static mmx_t mmx_80w = {0x0080008000800080LL};       /* 128 in every 16-bit word (chroma bias) */
00075 static mmx_t mmx_U_green = {0xf37df37df37df37dLL};   /* four identical 16-bit words */
00076 static mmx_t mmx_U_blue = {0x4093409340934093LL};
00077 static mmx_t mmx_V_red = {0x3312331233123312LL};
00078 static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
00079 static mmx_t mmx_10w = {0x1010101010101010LL};       /* 16 in every byte (luma offset) */
00080 static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};     /* keeps the low byte of every word */
00081 static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};
00082
00083 movd_m2r (*pu, mm0);   /* mm0 <- 4 U bytes */
00084 movd_m2r (*pv, mm1);   /* mm1 <- 4 V bytes */
00085 movq_m2r (*py, mm6);   /* mm6 <- 8 Y bytes */
00086 pxor_r2r (mm4, mm4);   /* mm4 = 0, used to zero-extend bytes to words */
00087
00088 /* Chroma: widen U and V to words, subtract 128, scale by 8, then    */
00089 /* multiply by the per-channel coefficients.  The shift by 3 followed */
00090 /* by pmulhw gives an effective factor of coefficient / 8192.         */
00091
00092
00093
00094
00095
00096
00097 punpcklbw_r2r (mm4, mm0);        /* mm0 = U as 4 words */
00098 punpcklbw_r2r (mm4, mm1);        /* mm1 = V as 4 words */
00099 psubsw_m2r (mmx_80w, mm0);       /* U - 128 */
00100 psubsw_m2r (mmx_80w, mm1);       /* V - 128 */
00101 psllw_i2r (3, mm0);
00102 psllw_i2r (3, mm1);
00103 movq_r2r (mm0, mm2);             /* mm2 = copy of U for the green term */
00104 movq_r2r (mm1, mm3);             /* mm3 = copy of V for the green term */
00105 pmulhw_m2r (mmx_U_green, mm2);
00106 pmulhw_m2r (mmx_V_green, mm3);
00107 pmulhw_m2r (mmx_U_blue, mm0);    /* mm0 = blue chroma term */
00108 pmulhw_m2r (mmx_V_red, mm1);     /* mm1 = red chroma term */
00109 paddsw_r2r (mm3, mm2);           /* mm2 = green chroma term (U part + V part) */
00110 /* Luma: subtract 16 with unsigned saturation, then split the 8 bytes */
00111 psubusb_m2r (mmx_10w, mm6);
00112 movq_r2r (mm6, mm7);
00113 pand_m2r (mmx_00ffw, mm6);       /* mm6 = Y of pixels 0,2,4,6 as words */
00114 psrlw_i2r (8, mm7);              /* mm7 = Y of pixels 1,3,5,7 as words */
00115 psllw_i2r (3, mm6);
00116 psllw_i2r (3, mm7);
00117 pmulhw_m2r (mmx_Y_coeff, mm6);
00118 pmulhw_m2r (mmx_Y_coeff, mm7);
00119
00120 /* Each chroma word covers one even and one odd pixel: add the even  */
00121 /* luma to one copy and the odd luma to another, saturate to bytes,  */
00122 /* then interleave even/odd back into pixel order.                    */
00123
00124
00125
00126
00127
00128 movq_r2r (mm0, mm3);             /* mm3 = blue term for the odd pixels */
00129 movq_r2r (mm1, mm4);             /* mm4 = red term for the odd pixels */
00130 movq_r2r (mm2, mm5);             /* mm5 = green term for the odd pixels */
00131 paddsw_r2r (mm6, mm0);
00132 paddsw_r2r (mm7, mm3);
00133 paddsw_r2r (mm6, mm1);
00134 paddsw_r2r (mm7, mm4);
00135 paddsw_r2r (mm6, mm2);
00136 paddsw_r2r (mm7, mm5);
00137 packuswb_r2r (mm0, mm0);         /* words -> bytes with unsigned saturation */
00138 packuswb_r2r (mm1, mm1);
00139 packuswb_r2r (mm2, mm2);
00140 packuswb_r2r (mm3, mm3);
00141 packuswb_r2r (mm4, mm4);
00142 packuswb_r2r (mm5, mm5);
00143 punpcklbw_r2r (mm3, mm0);        /* mm0 = blue for pixels 0..7 */
00144 punpcklbw_r2r (mm4, mm1);        /* mm1 = red for pixels 0..7 */
00145 punpcklbw_r2r (mm5, mm2);        /* mm2 = green for pixels 0..7 */
00146 }
00147
/*
 * Pack the 8 pixels left in mm0 (blue), mm1 (red) and mm2 (green) by
 * mmx_yuv2rgb() into 16-bit pixels and store the resulting 16 bytes at
 * image[0..15].
 *
 * image - destination for 8 two-byte pixels
 * cpu   - CPU_MMX or CPU_MMXEXT; read by the movntq() macro to choose the
 *         store instruction
 *
 * The masks keep the top 5 bits of blue and red and the top 6 bits of
 * green, so each 16-bit word ends up as red in bits 15..11, green in
 * bits 10..5 and blue in bits 4..0.
 * NOTE(review): operand order (source, destination) of the mmx.h macros
 * is assumed -- confirm.
 */
00148 static inline void mmx_unpack_16rgb (uint8_t * image, int cpu)
00149 {
00150 static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};
00151 static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
00152 static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};
00153
00154
00155
00156
00157
00158
00159
00160
00161 pand_m2r (mmx_bluemask, mm0);
00162 pand_m2r (mmx_greenmask, mm2);
00163 pand_m2r (mmx_redmask, mm1);
00164 psrlq_i2r (3, mm0);              /* blue moves to bits 4..0 of each byte; masked first, so nothing crosses bytes */
00165 pxor_r2r (mm4, mm4);             /* mm4 = 0 for zero-extension */
00166 movq_r2r (mm0, mm5);             /* keep blue for pixels 4..7 */
00167 movq_r2r (mm2, mm7);             /* keep green for pixels 4..7 */
00168 /* pixels 0..3 */
00169 punpcklbw_r2r (mm4, mm2);        /* green as words */
00170 punpcklbw_r2r (mm1, mm0);        /* word = red in the high byte, blue in the low byte */
00171 psllq_i2r (3, mm2);              /* green moves to bits 10..5 */
00172 por_r2r (mm2, mm0);
00173 movntq (mm0, *image);
00174 /* pixels 4..7 */
00175 punpckhbw_r2r (mm4, mm7);
00176 punpckhbw_r2r (mm1, mm5);
00177 psllq_i2r (3, mm7);
00178 por_r2r (mm7, mm5);
00179 movntq (mm5, *(image+8));
00180 }
00181
/*
 * Interleave the 8 pixels left in mm0 (blue), mm1 (red) and mm2 (green)
 * by mmx_yuv2rgb() with an alpha byte and store the resulting 32 bytes at
 * image[0..31].  Each pixel occupies 4 bytes in memory order
 * blue, green, red, alpha.
 *
 * image     - destination for 8 four-byte pixels
 * cpu       - CPU_MMX or CPU_MMXEXT; read by the movntq() macro
 * alphaones - non-zero: every alpha byte is 0xff; zero: every alpha byte is 0
 *
 * NOTE(review): operand order (source, destination) of the mmx.h macros
 * is assumed -- confirm.
 */
00182 static inline void mmx_unpack_32rgb (uint8_t * image, int cpu, int alphaones)
00183 {
00184
00185
00186
00187
00188
00189
00190
00191 if (alphaones)
00192 {
00193 static mmx_t mmx_1s = {0xffffffffffffffffLL};
00194 movq_m2r (mmx_1s, mm3);          /* mm3 = alpha, all ones */
00195 }
00196 else
00197 pxor_r2r (mm3, mm3);             /* mm3 = alpha, all zero */
00198
00199 movq_r2r (mm0, mm6);
00200 movq_r2r (mm1, mm7);
00201 movq_r2r (mm0, mm4);
00202 movq_r2r (mm1, mm5);
00203 punpcklbw_r2r (mm2, mm6);        /* B,G pairs for pixels 0..3 */
00204 punpcklbw_r2r (mm3, mm7);        /* R,A pairs for pixels 0..3 */
00205 punpcklwd_r2r (mm7, mm6);        /* pixels 0 and 1 */
00206 movntq (mm6, *image);
00207 movq_r2r (mm0, mm6);
00208 punpcklbw_r2r (mm2, mm6);        /* rebuild the B,G pairs consumed above */
00209 punpckhwd_r2r (mm7, mm6);        /* pixels 2 and 3 */
00210 movntq (mm6, *(image+8));
00211 punpckhbw_r2r (mm2, mm4);        /* B,G pairs for pixels 4..7 */
00212 punpckhbw_r2r (mm3, mm5);        /* R,A pairs for pixels 4..7 */
00213 punpcklwd_r2r (mm5, mm4);        /* pixels 4 and 5 */
00214 movntq (mm4, *(image+16));
00215 movq_r2r (mm0, mm4);
00216 punpckhbw_r2r (mm2, mm4);
00217 punpckhwd_r2r (mm5, mm4);        /* pixels 6 and 7 */
00218 movntq (mm4, *(image+24));
00219 }
00220
00221 static inline void yuv420_rgb16 (uint8_t * image,
00222 uint8_t * py, uint8_t * pu, uint8_t * pv,
00223 int width, int height,
00224 int rgb_stride, int y_stride, int uv_stride,
00225 int cpu, int alphaones)
00226 {
00227 (void)alphaones;
00228 int i;
00229
00230 rgb_stride -= 2 * width;
00231 y_stride -= width;
00232 uv_stride -= width >> 1;
00233 width >>= 3;
00234
00235 do {
00236 i = width;
00237 do {
00238 mmx_yuv2rgb (py, pu, pv);
00239 mmx_unpack_16rgb (image, cpu);
00240 py += 8;
00241 pu += 4;
00242 pv += 4;
00243 image += 16;
00244 } while (--i);
00245
00246 py += y_stride;
00247 image += rgb_stride;
00248 if (height & 1) {
00249 pu += uv_stride;
00250 pv += uv_stride;
00251 } else {
00252 pu -= 4 * width;
00253 pv -= 4 * width;
00254 }
00255 } while (--height);
00256
00257 emms();
00258 }
00259
00260 static inline void yuv420_argb32 (uint8_t * image, uint8_t * py,
00261 uint8_t * pu, uint8_t * pv,
00262 int width, int height,
00263 int rgb_stride, int y_stride, int uv_stride,
00264 int cpu, int alphaones)
00265 {
00266 int i;
00267
00268 rgb_stride -= 4 * width;
00269 y_stride -= width;
00270 uv_stride -= width >> 1;
00271 width >>= 3;
00272
00273 do {
00274 i = width;
00275 do {
00276 mmx_yuv2rgb (py, pu, pv);
00277 mmx_unpack_32rgb (image, cpu, alphaones);
00278 py += 8;
00279 pu += 4;
00280 pv += 4;
00281 image += 32;
00282 } while (--i);
00283
00284 py += y_stride;
00285 image += rgb_stride;
00286 if (height & 1) {
00287 pu += uv_stride;
00288 pv += uv_stride;
00289 } else {
00290 pu -= 4 * width;
00291 pv -= 4 * width;
00292 }
00293 } while (--height);
00294
00295 emms();
00296 }
00297
/*
 * 16-bit converter for MMXEXT builds: forwards every argument to
 * yuv420_rgb16() with cpu = CPU_MMXEXT, which makes the movntq() macro
 * select movntq_r2m for its stores.
 */
00298 static void mmxext_rgb16 (uint8_t * image,
00299 uint8_t * py, uint8_t * pu, uint8_t * pv,
00300 int width, int height,
00301 int rgb_stride, int y_stride, int uv_stride,
00302 int alphaones)
00303 {
00304 yuv420_rgb16 (image, py, pu, pv, width, height,
00305 rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
00306 }
00307
/*
 * 32-bit converter for MMXEXT builds: forwards every argument to
 * yuv420_argb32() with cpu = CPU_MMXEXT, which makes the movntq() macro
 * select movntq_r2m for its stores.
 */
00308 static void mmxext_argb32 (uint8_t * image,
00309 uint8_t * py, uint8_t * pu, uint8_t * pv,
00310 int width, int height,
00311 int rgb_stride, int y_stride, int uv_stride,
00312 int alphaones)
00313 {
00314 yuv420_argb32 (image, py, pu, pv, width, height,
00315 rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
00316 }
00317
/*
 * 16-bit converter for plain MMX: forwards every argument to
 * yuv420_rgb16() with cpu = CPU_MMX, which makes the movntq() macro fall
 * back to movq_r2m for its stores.
 */
00318 static void mmx_rgb16 (uint8_t * image,
00319 uint8_t * py, uint8_t * pu, uint8_t * pv,
00320 int width, int height,
00321 int rgb_stride, int y_stride, int uv_stride,
00322 int alphaones)
00323 {
00324 yuv420_rgb16 (image, py, pu, pv, width, height,
00325 rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
00326 }
00327
/*
 * 32-bit converter for plain MMX: forwards every argument to
 * yuv420_argb32() with cpu = CPU_MMX, which makes the movntq() macro fall
 * back to movq_r2m for its stores.
 */
00328 static void mmx_argb32 (uint8_t * image,
00329 uint8_t * py, uint8_t * pu, uint8_t * pv,
00330 int width, int height,
00331 int rgb_stride, int y_stride, int uv_stride,
00332 int alphaones)
00333 {
00334 yuv420_argb32 (image, py, pu, pv, width, height,
00335 rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
00336 }
00337 #endif
00338
00348 yuv2rgb_fun yuv2rgb_init_mmxext (int bpp, int mode)
00349 {
00350 #ifdef MMX
00351 if ((bpp == 16) && (mode == MODE_RGB))
00352 return mmxext_rgb16;
00353 else if ((bpp == 32) && (mode == MODE_RGB))
00354 return mmxext_argb32;
00355 #else
00356 (void)bpp;
00357 (void)mode;
00358 #endif
00359
00360 return NULL;
00361 }
00362
00372 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode)
00373 {
00374 #ifdef MMX
00375 if ((bpp == 16) && (mode == MODE_RGB))
00376 return mmx_rgb16;
00377 else if ((bpp == 32) && (mode == MODE_RGB))
00378 return mmx_argb32;
00379 #endif
00380
00381 if ((bpp == 32) && (mode == MODE_RGB))
00382 return yuv420_argb32_non_mmx;
00383
00384 return NULL;
00385 }
00386
/* Number of fractional bits kept in the fixed-point conversion constants. */
#define SCALE_BITS 10

/*
 * Conversion coefficients.  The literals are the real factors scaled by
 * 2^16 (roughly 1.164, 1.793, 2.113, 0.213 and 0.533); the shift reduces
 * them to SCALE_BITS fractional bits.
 */
#define C_Y  (76309 >> (16 - SCALE_BITS))
#define C_RV (117504 >> (16 - SCALE_BITS))
#define C_BU (138453 >> (16 - SCALE_BITS))
#define C_GU (13954 >> (16 - SCALE_BITS))
#define C_GV (34903 >> (16 - SCALE_BITS))

#if defined(__FreeBSD__)
/*
 * NOTE(review): presumably FreeBSD's UCHAR_MAX is not a plain int, which
 * matters to code comparing it against int values -- confirm before
 * removing this override.
 */
#undef UCHAR_MAX
#define UCHAR_MAX (int)__UCHAR_MAX
#endif

/* Clamp an int to the range 0..UCHAR_MAX and return it as a byte. */
static inline unsigned char yuv2rgb_clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > UCHAR_MAX)
        return (unsigned char)UCHAR_MAX;
    return (unsigned char)v;
}

/*
 * RGBOUT(r, g, b, y1): convert one luma sample y1 to clamped red, green
 * and blue bytes stored in the lvalues r, g and b.  Uses the variables
 * y (scratch), r_add, g_add and b_add of the enclosing scope; the *_add
 * values carry the chroma contribution and the rounding term.
 */
#define RGBOUT(r, g, b, y1) \
    do { \
        y = ((y1) - 16) * C_Y; \
        (r) = yuv2rgb_clip_u8((y + r_add) >> SCALE_BITS); \
        (g) = yuv2rgb_clip_u8((y + g_add) >> SCALE_BITS); \
        (b) = yuv2rgb_clip_u8((y + b_add) >> SCALE_BITS); \
    } while (0)

/*
 * Portable C conversion of a planar YUV 4:2:0 picture to 32-bit pixels.
 *
 * image      - destination, 4 bytes per pixel, h_size * 4 bytes per row
 * py         - luma plane, h_size bytes per row
 * pu, pv     - chroma planes, h_size / 2 bytes per row, one row per pair
 *              of luma rows
 * h_size     - picture width in pixels; pixels are handled in horizontal
 *              pairs, so the last column of an odd width is not converted
 * v_size     - picture height in rows
 * rgb_stride, y_stride, uv_stride
 *            - accepted for signature compatibility but ignored, exactly
 *              as before: all planes are treated as tightly packed
 * alphaones  - non-zero writes 0xff into every alpha byte, zero writes 0
 *
 * Fixes relative to the previous implementation (output is unchanged for
 * even h_size and even v_size):
 *  - with an odd v_size the last iteration used to read one luma row and
 *    write one output row past the end of the picture; the unpaired last
 *    row is now converted on its own.
 *  - with an odd h_size the luma pointer advanced 2*h_size - 1 bytes per
 *    row pair, shearing every following row; it now advances 2*h_size.
 */
static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
                                  unsigned char *pu, unsigned char *pv,
                                  int h_size, int v_size, int rgb_stride,
                                  int y_stride, int uv_stride, int alphaones)
{
/* Byte offsets of the channels inside one 4-byte output pixel. */
#ifdef WORDS_BIGENDIAN
#define R_OI  1
#define G_OI  2
#define B_OI  3
#define A_OI  0
#else
#define R_OI  2
#define G_OI  1
#define B_OI  0
#define A_OI  3
#endif

    const int pairs = h_size / 2;               /* horizontal pixel pairs per row */
    const int dst_row = h_size * 4;             /* bytes per output row */
    const unsigned char alpha = alphaones ? 0xff : 0;
    const unsigned char *luma = py;
    const unsigned char *cb_ptr = pu;
    const unsigned char *cr_ptr = pv;
    unsigned char *dst = image;
    int y, r_add, g_add, b_add;                 /* used by RGBOUT */

    /* The strides are deliberately unused (see above). */
    (void)rgb_stride;
    (void)y_stride;
    (void)uv_stride;

    while (v_size > 0)
    {
        /* The last row of an odd-height picture has no partner row. */
        const int two_rows = (v_size > 1);
        const unsigned char *y1_ptr = luma;
        const unsigned char *y2_ptr = luma + h_size;
        unsigned char *d1 = dst;
        unsigned char *d2 = dst + dst_row;
        int w;

        for (w = pairs; w > 0; w--)
        {
            /* One chroma sample pair serves a 2x2 block of pixels. */
            const int cb = *cb_ptr++ - 128;
            const int cr = *cr_ptr++ - 128;

            r_add = C_RV * cr + (1 << (SCALE_BITS - 1));
            g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1));
            b_add = C_BU * cb + (1 << (SCALE_BITS - 1));

            RGBOUT(d1[R_OI], d1[G_OI], d1[B_OI], y1_ptr[0]);
            RGBOUT(d1[R_OI+4], d1[G_OI+4], d1[B_OI+4], y1_ptr[1]);
            d1[A_OI] = d1[A_OI+4] = alpha;

            if (two_rows)
            {
                RGBOUT(d2[R_OI], d2[G_OI], d2[B_OI], y2_ptr[0]);
                RGBOUT(d2[R_OI+4], d2[G_OI+4], d2[B_OI+4], y2_ptr[1]);
                d2[A_OI] = d2[A_OI+4] = alpha;
                d2 += 8;
                y2_ptr += 2;
            }

            d1 += 8;
            y1_ptr += 2;
        }

        if (!two_rows)
            break;

        dst += 2 * dst_row;
        luma += 2 * h_size;
        v_size -= 2;
    }
}
00475
/* Fixed-point helpers for the RGB->YUV direction: 8 fractional bits. */
#define SCALEBITS 8
#define ONE_HALF (1 << (SCALEBITS - 1))
#define FIX(x) ((int) ((x) * (1L<<SCALEBITS) + 0.5))

/* Byte offsets of the channels inside one 4-byte source pixel. */
#ifdef WORDS_BIGENDIAN
#define R_II  3
#define G_II  2
#define B_II  1
#define A_II  0
#else
#define R_II  0
#define G_II  1
#define B_II  2
#define A_II  3
#endif

/*
 * Convert one 2x2 output cell.
 *
 * cols / rows say how many source columns / rows (1 or 2) really exist for
 * this cell.  Existing samples produce a luma and an alpha value; missing
 * positions are padded with luma 16 and alpha 0.  The two chroma outputs
 * are computed from the sum of the existing samples, with the rounding
 * term and the shift matched to the number of samples (1, 2 or 4).
 *
 * lum, alpha - top-left output position of the cell; wrap is the distance
 *              to the row below
 * cb, cr     - where the two chroma bytes of the cell are written
 * src        - top-left source pixel; wrap4 is the distance to the row below
 */
static void rgb32_cell_to_yuv(unsigned char *lum, unsigned char *alpha,
                              unsigned char *cb, unsigned char *cr,
                              const unsigned char *src,
                              int wrap, int wrap4, int cols, int rows)
{
    const int shift = (cols - 1) + (rows - 1);  /* log2 of the sample count */
    const int bias = (ONE_HALF << shift) - 1;   /* rounding term for the sums */
    int rsum = 0, gsum = 0, bsum = 0;
    int row, col;

    for (row = 0; row < 2; row++)
    {
        for (col = 0; col < 2; col++)
        {
            const int out = row * wrap + col;

            if (row < rows && col < cols)
            {
                const unsigned char *px = src + row * wrap4 + col * 4;
                const int r = px[R_II];
                const int g = px[G_II];
                const int b = px[B_II];

                rsum += r;
                gsum += g;
                bsum += b;
                lum[out] = (FIX(0.29900) * r + FIX(0.58700) * g +
                            FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
                alpha[out] = px[A_II];
            }
            else
            {
                lum[out] = 16;
                alpha[out] = 0;
            }
        }
    }

    cr[0] = ((- FIX(0.16874) * rsum - FIX(0.33126) * gsum +
              FIX(0.50000) * bsum + bias) >> (SCALEBITS + shift)) + 128;
    cb[0] = ((FIX(0.50000) * rsum - FIX(0.41869) * gsum -
              FIX(0.08131) * bsum + bias) >> (SCALEBITS + shift)) + 128;
}

/*
 * Convert a packed 32-bit picture (4 bytes per pixel, channel positions
 * given by R_II/G_II/B_II/A_II) to planar 4:2:0 output plus a separate
 * alpha plane.
 *
 * lum, alpha - full-resolution output planes; their row length is width
 *              rounded up to an even number, and an odd width or height
 *              is padded with luma 16 / alpha 0 in the extra column / row
 * cb, cr     - subsampled output planes, one byte per 2x2 cell, written
 *              consecutively
 * src        - source pixels
 * width, height - picture size in pixels
 * srcwidth   - source row length in pixels (row pitch is srcwidth * 4 bytes)
 */
void rgb32_to_yuv420p(unsigned char *lum, unsigned char *cb, unsigned char *cr,
                      unsigned char *alpha, unsigned char *src,
                      int width, int height, int srcwidth)
{
    const int wrap = (width + 1) & ~1;
    const int wrap4 = srcwidth * 4;
    unsigned char *p = src;
    int x, y;

    /* Complete pairs of rows. */
    for (y = 0; y + 1 < height; y += 2)
    {
        for (x = 0; x + 1 < width; x += 2)
        {
            rgb32_cell_to_yuv(lum, alpha, cb, cr, p, wrap, wrap4, 2, 2);
            cb++;
            cr++;
            p += 2 * 4;
            lum += 2;
            alpha += 2;
        }
        if (width & 1)
        {
            /* Unpaired last column. */
            rgb32_cell_to_yuv(lum, alpha, cb, cr, p, wrap, wrap4, 1, 2);
            cb++;
            cr++;
            p += 4;
            lum += 2;
            alpha += 2;
        }

        /* Step from the end of this row pair to the start of the next. */
        p += wrap4 * 2 - width * 4;
        lum += wrap;
        alpha += wrap;
    }

    /* Unpaired last row. */
    if (height & 1)
    {
        for (x = 0; x + 1 < width; x += 2)
        {
            rgb32_cell_to_yuv(lum, alpha, cb, cr, p, wrap, wrap4, 2, 1);
            cb++;
            cr++;
            p += 2 * 4;
            lum += 2;
            alpha += 2;
        }
        if (width & 1)
        {
            rgb32_cell_to_yuv(lum, alpha, cb, cr, p, wrap, wrap4, 1, 1);
            cb++;
            cr++;
            p += 4;
            lum += 2;
            alpha += 2;
        }
    }
}
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706
00707
00708
00709
00710
00711
/*
 * Plain C interleave of planar 4:2:0 data into packed bytes ordered
 * U, Y, V, Y (two pixels per 4 bytes).
 *
 * image, vuy_stride  - packed destination and its row pitch in bytes
 * py, pu, pv         - source planes
 * y_stride, u_stride, v_stride - row pitches of the source planes
 * h_size, v_size     - picture size in pixels; work is done on 2x2 cells,
 *                      so an odd last column or row is not converted
 *
 * Each chroma sample is written to both rows of its 2x2 cell.
 */
static void non_vec_i420_2vuy(
    uint8_t *image, int vuy_stride,
    const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    int h_size, int v_size)
{
    const int row_pairs = v_size >> 1;
    const int col_pairs = h_size >> 1;
    int row, col;

    for (row = 0; row < row_pairs; row++)
    {
        uint8_t *dst_top = image + 2*row * vuy_stride;
        uint8_t *dst_bot = image + 2*row * vuy_stride + vuy_stride;
        const uint8_t *y_top = py + 2*row * y_stride;
        const uint8_t *y_bot = py + 2*row * y_stride + y_stride;
        const uint8_t *u_row = pu + row * u_stride;
        const uint8_t *v_row = pv + row * v_stride;

        for (col = 0; col < col_pairs; col++)
        {
            dst_top[0] = u_row[col];
            dst_bot[0] = u_row[col];
            dst_top[1] = y_top[0];
            dst_bot[1] = y_bot[0];
            dst_top[2] = v_row[col];
            dst_bot[2] = v_row[col];
            dst_top[3] = y_top[1];
            dst_bot[3] = y_bot[1];

            dst_top += 4;
            dst_bot += 4;
            y_top += 2;
            y_bot += 2;
        }
    }
}
00759
00760 #ifdef MMX
00761
/*
 * MMX version of non_vec_i420_2vuy(): interleaves planar 4:2:0 data into
 * packed U, Y, V, Y bytes, 16 pixels of two rows per inner iteration.
 *
 * image, vuy_stride  - packed destination and its row pitch in bytes
 * py, pu, pv         - source planes
 * y_stride, u_stride, v_stride - row pitches of the source planes
 * h_size, v_size     - picture size in pixels
 *
 * Falls back to non_vec_i420_2vuy() unless h_size is a multiple of 16 and
 * v_size is even.
 *
 * NOTE(review): the stores use movntq_r2m unconditionally, while
 * get_i420_2vuy_conv() selects this function for any MMX build --
 * confirm that every supported CPU implements that instruction.
 * NOTE(review): operand order (source, destination) of the mmx.h macros
 * is assumed in the comments below -- confirm.
 */
00773 static void mmx_i420_2vuy(
00774 uint8_t *image, int vuy_stride,
00775 const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
00776 int y_stride, int u_stride, int v_stride,
00777 int h_size, int v_size)
00778 {
00779 uint8_t *pi1, *pi2;
00780 const uint8_t *py1 = py;
00781 const uint8_t *py2 = py;
00782 const uint8_t *pu1 = pu;
00783 const uint8_t *pv1 = pv;
00784
00785 int x,y;
00786
00787 if ((h_size % 16) || (v_size % 2))
00788 {
00789 non_vec_i420_2vuy(image, vuy_stride,
00790 py, pu, pv, y_stride, u_stride, v_stride,
00791 h_size, v_size);
00792 return;
00793 }
00794
00795 emms();
00796
00797 for (y = 0; y < (v_size>>1); y++)
00798 {
/* Row pointers for this pair of output rows and the chroma row they share. */
00799 pi1 = image + 2*y * vuy_stride;
00800 pi2 = image + 2*y * vuy_stride + vuy_stride;
00801 py1 = py + 2*y * y_stride;
00802 py2 = py + 2*y * y_stride + y_stride;
00803 pu1 = pu + y * u_stride;
00804 pv1 = pv + y * v_stride;
00805
00806 for (x = 0; x < h_size / 16; x++)
00807 {
00808 movq_m2r (*py1, mm0);            /* 8 luma bytes, upper row */
00809 movq_m2r (*py2, mm1);            /* 8 luma bytes, lower row */
00810 movq_m2r (*pu1, mm2);            /* 8 U bytes */
00811 movq_m2r (*pv1, mm3);            /* 8 V bytes */
00812
00813 movq_r2r (mm2, mm4);
00814
00815 punpcklbw_r2r (mm3, mm2);        /* mm2 = U,V pairs for chroma samples 0..3 */
00816 punpckhbw_r2r (mm3, mm4);        /* mm4 = U,V pairs for chroma samples 4..7 */
00817
00818 movq_r2r (mm2, mm5);
00819 movq_r2r (mm2, mm6);
00820 punpcklbw_r2r (mm0, mm5);        /* U Y V Y for pixels 0..3, upper row */
00821 punpckhbw_r2r (mm0, mm6);        /* U Y V Y for pixels 4..7, upper row */
00822
00823 movntq_r2m (mm5, *(pi1));
00824 movntq_r2m (mm6, *(pi1+8));
00825
00826 movq_r2r (mm2, mm5);
00827 movq_r2r (mm2, mm6);
00828 punpcklbw_r2r (mm1, mm5);        /* same chroma, lower row luma */
00829 punpckhbw_r2r (mm1, mm6);
00830
00831 movntq_r2m (mm5, *(pi2));
00832 movntq_r2m (mm6, *(pi2+8));
00833
00834 /* Second half: pixels 8..15 with chroma samples 4..7. */
00835 movq_m2r (*(py1+8), mm0);
00836 movq_m2r (*(py2+8), mm1);
00837
00838 movq_r2r (mm4, mm5);
00839 movq_r2r (mm4, mm6);
00840 punpcklbw_r2r (mm0, mm5);
00841 punpckhbw_r2r (mm0, mm6);
00842
00843 movntq_r2m (mm5, *(pi1+16));
00844 movntq_r2m (mm6, *(pi1+24));
00845
00846 movq_r2r (mm4, mm5);
00847 movq_r2r (mm4, mm6);
00848 punpcklbw_r2r (mm1, mm5);
00849 punpckhbw_r2r (mm1, mm6);
00850
00851 movntq_r2m (mm5, *(pi2+16));
00852 movntq_r2m (mm6, *(pi2+24));
00853
00854 pi1 += 32;
00855 pi2 += 32;
00856 py1 += 16;
00857 py2 += 16;
00858 pu1 += 8;
00859 pv1 += 8;
00860 }
00861 }
00862
00863 emms();
00864 }
00865
00866 #endif // MMX
00867
00868 #ifdef HAVE_ALTIVEC
00869
00870
00871
/*
 * Helper macros for the AltiVec converters.  They operate on local
 * variables of the expanding function (pi1, pi2, py1, py2, pu, pv, h_size,
 * u_vec, v_vec, uv_vec, y_vec) and are not wrapped in do/while, so they
 * must only be used as full statements.
 *
 * VEC_NEXT_LINES(): step to the next pair of rows.  The "first row"
 * pointers take over the current "second row" pointers, which then move
 * one packed row (h_size * 2 bytes) / one luma row (h_size bytes) ahead.
 * This relies on rows being stored back to back without padding.
 */
00872 #define VEC_NEXT_LINES() \
00873 pi1 = pi2; \
00874 pi2 += h_size * 2; \
00875 py1 = py2; \
00876 py2 += h_size;
00877
/* VEC_LOAD_UV(): load 16 U and 16 V bytes and advance pu and pv by 16. */
00878 #define VEC_LOAD_UV() \
00879 u_vec = vec_ld(0, pu); pu += 16; \
00880 v_vec = vec_ld(0, pv); pv += 16;
00881
/*
 * VEC_MERGE(a): `a` is vec_mergeh or vec_mergel and selects which half of
 * the loaded U/V vectors is interleaved into uv_vec.  That chroma is then
 * interleaved with 16 luma bytes from each of the two current rows and
 * stored as 32 packed bytes per row; py1, py2, pi1 and pi2 advance
 * accordingly.
 * NOTE(review): vec_ld/vec_st work on 16-byte aligned addresses, so the
 * plane pointers are presumably required to be 16-byte aligned -- confirm.
 */
00882 #define VEC_MERGE(a) \
00883 uv_vec = a(u_vec, v_vec); \
00884 y_vec = vec_ld(0, py1); py1 += 16; \
00885 vec_st(vec_mergeh(uv_vec, y_vec), 0, pi1); pi1 += 16; \
00886 vec_st(vec_mergel(uv_vec, y_vec), 0, pi1); pi1 += 16; \
00887 y_vec = vec_ld(0, py2); py2 += 16; \
00888 vec_st(vec_mergeh(uv_vec, y_vec), 0, pi2); pi2 += 16; \
00889 vec_st(vec_mergel(uv_vec, y_vec), 0, pi2); pi2 += 16;
00890
/*
 * AltiVec version of non_vec_i420_2vuy(): interleaves planar 4:2:0 data
 * into packed U, Y, V, Y bytes.
 *
 * image, vuy_stride  - packed destination and its row pitch in bytes
 * py, pu, pv         - source planes
 * y_stride, u_stride, v_stride - row pitches of the source planes
 * h_size, v_size     - picture size in pixels
 *
 * The vector paths walk all buffers linearly, so they are only taken when
 * no plane has padding between rows.  Two layouts are vectorised:
 *   - h_size a multiple of 32 and v_size even;
 *   - h_size a multiple of 16 and v_size a multiple of 4.
 * Everything else is delegated to non_vec_i420_2vuy().
 */
00903 static void altivec_i420_2vuy(
00904 uint8_t *image, int vuy_stride,
00905 const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
00906 int y_stride, int u_stride, int v_stride,
00907 int h_size, int v_size)
00908 {
00909 uint8_t *pi1, *pi2 = image;
00910 const uint8_t *py1;
00911 const uint8_t *py2 = py;
00912
00913 int x, y;
00914
00915 vector unsigned char u_vec;
00916 vector unsigned char v_vec;
00917 vector unsigned char uv_vec;
00918 vector unsigned char y_vec;
00919
/* Padding bytes at the end of each row of every plane. */
00920 int vuy_extra = vuy_stride - (h_size<<1);
00921 int y_extra = y_stride - (h_size);
00922 int u_extra = u_stride - (h_size>>1);
00923 int v_extra = v_stride - (h_size>>1);
00924
00925 if (vuy_extra || y_extra || u_extra || v_extra)
00926 {
00927 /* Some plane has row padding: use the stride-aware C version. */
00928 non_vec_i420_2vuy(image, vuy_stride,
00929 py, pu, pv,
00930 y_stride, u_stride, v_stride,
00931 h_size, v_size);
00932 return;
00933 }
00934
00935 if (!((h_size % 32) || (v_size % 2)))
00936 {
00937 /* Rows split evenly into 32-pixel groups: one chroma load per group. */
00938 for (y = v_size / 2; y--; )
00939 {
00940 VEC_NEXT_LINES();
00941 for (x = h_size / 32; x--; )
00942 {
00943 VEC_LOAD_UV();
00944 VEC_MERGE(vec_mergeh);
00945 VEC_MERGE(vec_mergel);
00946 }
00947 }
00948
00949 }
00950 else if (!((h_size % 16) || (v_size % 4)))
00951 {
00952 /* Rows end with a 16-pixel half group: handle two row pairs per pass. */
00953 for (y = v_size / 4; y--; )
00954 {
00955 /* First row pair: the whole 32-pixel groups. */
00956 VEC_NEXT_LINES();
00957 for (x = h_size / 32; x--; )
00958 {
00959 VEC_LOAD_UV();
00960 VEC_MERGE(vec_mergeh);
00961 VEC_MERGE(vec_mergel);
00962 }
00963
00964 /* Last 16 pixels of the first pair use the first half of a chroma load... */
00965 VEC_LOAD_UV();
00966 VEC_MERGE(vec_mergeh);
00967
00968 /* ...and its second half starts the next row pair. */
00969 VEC_NEXT_LINES();
00970 VEC_MERGE(vec_mergel);
00971
00972 /* Remaining whole 32-pixel groups of the second row pair. */
00973 for (x = h_size / 32; x--; )
00974 {
00975 VEC_LOAD_UV();
00976 VEC_MERGE(vec_mergeh);
00977 VEC_MERGE(vec_mergel);
00978 }
00979 }
00980 }
00981 else
00982 {
00983 /* Dimensions do not fit either vector layout. */
00984 non_vec_i420_2vuy(image, vuy_stride,
00985 py, pu, pv,
00986 y_stride, u_stride, v_stride,
00987 h_size, v_size);
00988 }
00989 }
00990
00991 #endif // HAVE_ALTIVEC
00992
00993
01007 conv_i420_2vuy_fun get_i420_2vuy_conv(void)
01008 {
01009 #ifdef HAVE_ALTIVEC
01010 if (has_altivec())
01011 return altivec_i420_2vuy;
01012 #endif
01013
01014 #ifdef MMX
01015 return mmx_i420_2vuy;
01016 #endif
01017
01018 return non_vec_i420_2vuy;
01019 }
01020
/*
 * Plain C split of packed U, Y, V, Y bytes into planar 4:2:0 data.
 *
 * py, pu, pv         - destination planes
 * y_stride, u_stride, v_stride - row pitches of the destination planes
 * image, vuy_stride  - packed source and its row pitch in bytes
 * h_size, v_size     - picture size in pixels; work is done on 2x2 cells,
 *                      so an odd last column or row is not converted
 *
 * Luma is copied through; each output chroma sample is the truncated
 * average of the chroma bytes of the two source rows of its cell.
 */
static void non_vec_2vuy_i420(
    uint8_t *py, uint8_t *pu, uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    const uint8_t *image, int vuy_stride,
    int h_size, int v_size)
{
    const int row_pairs = v_size >> 1;
    const int col_pairs = h_size >> 1;
    int row, col;

    for (row = 0; row < row_pairs; row++)
    {
        const uint8_t *src_top = image + 2*row * vuy_stride;
        const uint8_t *src_bot = image + 2*row * vuy_stride + vuy_stride;
        uint8_t *y_top = py + 2*row * y_stride;
        uint8_t *y_bot = py + 2*row * y_stride + y_stride;
        uint8_t *u_row = pu + row * u_stride;
        uint8_t *v_row = pv + row * v_stride;

        for (col = 0; col < col_pairs; col++)
        {
            u_row[col] = (src_top[0] + src_bot[0]) >> 1;
            y_top[0]   = src_top[1];
            y_bot[0]   = src_bot[1];
            v_row[col] = (src_top[2] + src_bot[2]) >> 1;
            y_top[1]   = src_top[3];
            y_bot[1]   = src_bot[3];

            src_top += 4;
            src_bot += 4;
            y_top += 2;
            y_bot += 2;
        }
    }
}
01061
01062 #ifdef HAVE_ALTIVEC
01063
01064
01065
/*
 * Helper macros for altivec_2vuy_i420().  They operate on local variables
 * of the expanding function (pi1, pi2, py1, py2, pu, pv, pa_vec, pb_vec,
 * uv1_vec, uv2_vec, uva_vec, uvb_vec, eight_vec) and must only be used as
 * full statements.
 *
 * VEC_READ_LINE(ptr, y, uv): read 32 packed bytes (16 pixels) from ptr.
 * Viewing them as 16-bit elements, vec_pack keeps one byte of every
 * element and stores those 16 bytes to y as luma; shifting every element
 * right by 8 first keeps the other byte, which lands in uv.  ptr advances
 * by 32 and y by 16.
 * NOTE(review): which byte of each 16-bit element is luma depends on
 * element byte order; this presumably targets big-endian PowerPC, where
 * the low byte is the second byte in memory -- confirm.
 */
01066 #define VEC_READ_LINE(ptr, y, uv) \
01067 pa_vec = vec_ld(0, ptr); ptr += 16; \
01068 pb_vec = vec_ld(0, ptr); ptr += 16; \
01069 vec_st(vec_pack((vector unsigned short)pa_vec, \
01070 (vector unsigned short)pb_vec), \
01071 0, y); y += 16; \
01072 uv = vec_pack(vec_sr((vector unsigned short)pa_vec, eight_vec), \
01073 vec_sr((vector unsigned short)pb_vec, eight_vec));
01074
/*
 * VEC_SPLIT(a): process 16 pixels of both current rows and leave the
 * per-byte average (vec_avg) of their chroma bytes in `a`.
 */
01075 #define VEC_SPLIT(a) \
01076 VEC_READ_LINE(pi1, py1, uv1_vec); \
01077 VEC_READ_LINE(pi2, py2, uv2_vec); \
01078 a = vec_avg(uv1_vec, uv2_vec);
01079
/*
 * VEC_STORE_UV(): separate the interleaved chroma held in uva_vec and
 * uvb_vec with the same pack / shift-and-pack trick and store 16 bytes to
 * pv and 16 bytes to pu, advancing both pointers by 16.
 */
01080 #define VEC_STORE_UV() \
01081 vec_st(vec_pack((vector unsigned short)uva_vec, \
01082 (vector unsigned short)uvb_vec), \
01083 0, pv); pv += 16; \
01084 vec_st(vec_pack(vec_sr((vector unsigned short)uva_vec, eight_vec), \
01085 vec_sr((vector unsigned short)uvb_vec, eight_vec)), \
01086 0, pu); pu += 16;
01087
01088
/*
 * AltiVec version of non_vec_2vuy_i420(): splits packed U, Y, V, Y bytes
 * into planar 4:2:0 data, averaging the chroma of each pair of rows.
 *
 * py, pu, pv         - destination planes
 * y_stride, u_stride, v_stride - row pitches of the destination planes
 * image, vuy_stride  - packed source and its row pitch in bytes
 * h_size, v_size     - picture size in pixels
 *
 * The vector paths walk all buffers linearly, so they are only taken when
 * no plane has padding between rows.  Two layouts are vectorised:
 *   - h_size a multiple of 32 and v_size even;
 *   - h_size a multiple of 16 and v_size a multiple of 4.
 * Everything else is delegated to non_vec_2vuy_i420().
 * Row stepping uses the VEC_NEXT_LINES() macro defined with the
 * i420 -> 2vuy helpers.
 */
01098 static void altivec_2vuy_i420(
01099 uint8_t *py, uint8_t *pu, uint8_t *pv,
01100 int y_stride, int u_stride, int v_stride,
01101 const uint8_t *image, int vuy_stride,
01102 int h_size, int v_size)
01103 {
01104 const uint8_t *pi1;
01105 const uint8_t *pi2 = image;
01106 uint8_t *py1, *py2 = py;
01107
01108 int x, y;
01109
01110 vector unsigned short eight_vec = vec_splat_u16(8);   /* shift count used to reach the other byte of each 16-bit element */
01111 vector unsigned char pa_vec, pb_vec,
01112 uv1_vec, uv2_vec,
01113 uva_vec, uvb_vec;
01114
/* Padding bytes at the end of each row of every plane. */
01115 int vuy_extra = vuy_stride - (h_size<<1);
01116 int y_extra = y_stride - (h_size);
01117 int u_extra = u_stride - (h_size>>1);
01118 int v_extra = v_stride - (h_size>>1);
01119
01120 if (vuy_extra || y_extra || u_extra || v_extra)
01121 {
01122 /* Some plane has row padding: use the stride-aware C version. */
01123 non_vec_2vuy_i420(py, pu, pv,
01124 y_stride, u_stride, v_stride,
01125 image, vuy_stride,
01126 h_size, v_size);
01127 return;
01128 }
01129
01130 if (!((h_size % 32) || (v_size % 2)))
01131 {
01132 /* Rows split evenly into 32-pixel groups: one chroma store per group. */
01133 for (y = v_size / 2; y--; )
01134 {
01135 VEC_NEXT_LINES();
01136 for (x = h_size / 32; x--; )
01137 {
01138 VEC_SPLIT(uva_vec);
01139 VEC_SPLIT(uvb_vec);
01140 VEC_STORE_UV();
01141 }
01142 }
01143 }
01144 else if (!((h_size % 16) || (v_size % 4)))
01145 {
01146 /* Rows end with a 16-pixel half group: handle two row pairs per pass. */
01147 for (y = v_size / 4; y--; )
01148 {
01149 /* First row pair: the whole 32-pixel groups. */
01150 VEC_NEXT_LINES();
01151 for (x = h_size / 32; x--; )
01152 {
01153 VEC_SPLIT(uva_vec);
01154 VEC_SPLIT(uvb_vec);
01155 VEC_STORE_UV();
01156 }
01157
01158 /* Last 16 pixels of the first pair fill the first half of a chroma store... */
01159 VEC_SPLIT(uva_vec);
01160
01161 /* ...and the first 16 pixels of the next row pair fill the second half. */
01162 VEC_NEXT_LINES();
01163 VEC_SPLIT(uvb_vec);
01164 VEC_STORE_UV();
01165
01166 /* Remaining whole 32-pixel groups of the second row pair. */
01167 for (x = h_size / 32; x--; )
01168 {
01169 VEC_SPLIT(uva_vec);
01170 VEC_SPLIT(uvb_vec);
01171 VEC_STORE_UV();
01172 }
01173 }
01174 }
01175 else
01176 {
01177 /* Dimensions do not fit either vector layout. */
01178 non_vec_2vuy_i420(py, pu, pv,
01179 y_stride, u_stride, v_stride,
01180 image, vuy_stride,
01181 h_size, v_size);
01182 }
01183 }
01184
01185 #endif // HAVE_ALTIVEC
01186
01187
01201 conv_2vuy_i420_fun get_2vuy_i420_conv(void)
01202 {
01203 #ifdef HAVE_ALTIVEC
01204 if (has_altivec())
01205 return altivec_2vuy_i420;
01206 #endif
01207 return non_vec_2vuy_i420;
01208 }