00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include <string.h>
00025 #include <stdlib.h>
00026 #ifdef HAVE_STDINT_H
00027 #include <stdint.h>
00028 #endif
00029 #include <math.h>
00030
00031
00032 #include "config.h"
00033 #include "dsputil.h"
00034
00035 #ifdef MMX
00036 #include "i386/mmx.h"
00037 #endif
00038
/*
 * Run-time selected entry points.  All three pointers are assigned by
 * init_yuv_conversion(); until it has run they hold their zero
 * initialiser and must not be called.
 */

/* Planar Y/U/V (separate planes, one pitch each) -> packed Y U Y V. */
void (*yv12_to_yuy2)(const unsigned char *y_src, int y_src_pitch,
                     const unsigned char *u_src, int u_src_pitch,
                     const unsigned char *v_src, int v_src_pitch,
                     unsigned char *yuy2_map, int yuy2_pitch,
                     int width, int height, int progressive);

/* Packed Y U Y V -> planar Y/U/V (separate planes, one pitch each). */
void (*yuy2_to_yv12)(const unsigned char *yuy2_map, int yuy2_pitch,
                     unsigned char *y_dst, int y_dst_pitch,
                     unsigned char *u_dst, int u_dst_pitch,
                     unsigned char *v_dst, int v_dst_pitch,
                     int width, int height);

/* Vertical chroma filter over one packed scanline (m, t and b are the
 * three input scanlines that are blended into output). */
void (*vfilter_chroma_332_packed422_scanline)(uint8_t *output, int width,
                                              uint8_t *m, uint8_t *t,
                                              uint8_t *b);
00052
00053
/*
 * Emit two horizontally adjacent pixels on two output rows at once.
 *
 * Uses (and advances) the caller's local pointers:
 *   p_y1, p_y2     luma of the first / second row
 *   p_u,  p_v      chroma row copied unchanged into the first output row
 *   p_u2, p_v2     chroma row averaged with p_u / p_v for the second row
 *   p_line1/2      packed output rows, written as Y U Y V
 *
 * The average truncates ((a + b) >> 1).
 *
 * Wrapped in do { } while (0) so that "C_YUV420_YUYV( );" is a single
 * statement and stays correct under an unbraced if/else.
 */
#define C_YUV420_YUYV( )                                                   \
    do {                                                                   \
        *p_line1++ = *p_y1++; *p_line2++ = *p_y2++;                        \
        *p_line1++ = *p_u;    *p_line2++ = (*p_u++ + *p_u2++)>>1;          \
        *p_line1++ = *p_y1++; *p_line2++ = *p_y2++;                        \
        *p_line1++ = *p_v;    *p_line2++ = (*p_v++ + *p_v2++)>>1;          \
    } while (0)
00059
00060
00061
00062
00063
00064
/*
 * Convert row_pairs pairs of luma rows into packed Y U Y V.
 *
 * Each *_step is the distance in bytes between two rows that belong
 * together (the plain pitch for progressive material, twice the pitch
 * when walking a single field).  For every pair, the first output row
 * takes its chroma row unchanged; the second output row takes the
 * truncating average of that chroma row and the following one.  The last
 * pair has no following chroma row and reuses its own, so nothing beyond
 * the chroma plane is ever read.
 */
static void yv12_to_yuy2_field_c
    (const uint8_t *y_row, int y_step,
     const uint8_t *u_row, int u_step,
     const uint8_t *v_row, int v_step,
     uint8_t *dst_row, int dst_step,
     int width, int row_pairs)
{
    const int pixel_pairs = width / 2;
    int pair, x;

    for ( pair = row_pairs; pair > 0; pair-- )
    {
        const int last = ( pair == 1 );

        uint8_t *p_line1 = dst_row;
        uint8_t *p_line2 = dst_row + dst_step;
        const uint8_t *p_y1 = y_row;
        const uint8_t *p_y2 = y_row + y_step;
        const uint8_t *p_u  = u_row;
        const uint8_t *p_v  = v_row;
        const uint8_t *p_u2 = last ? u_row : u_row + u_step;
        const uint8_t *p_v2 = last ? v_row : v_row + v_step;

        for ( x = 0; x < pixel_pairs; x++ )
        {
            *p_line1++ = *p_y1++;
            *p_line2++ = *p_y2++;
            *p_line1++ = *p_u;
            *p_line2++ = (*p_u++ + *p_u2++) >> 1;
            *p_line1++ = *p_y1++;
            *p_line2++ = *p_y2++;
            *p_line1++ = *p_v;
            *p_line2++ = (*p_v++ + *p_v2++) >> 1;
        }

        /* Row starts are recomputed from the pitch, so an odd width or
         * any padding cannot make the rows drift. */
        dst_row += 2 * dst_step;
        y_row   += 2 * y_step;
        u_row   += u_step;
        v_row   += v_step;
    }
}

/*
 * Planar Y/U/V (one chroma row per two luma rows, half-width chroma)
 * to packed Y U Y V, portable C version.
 *
 * y_src/u_src/v_src  source planes with their pitches in bytes
 * yuy2_map           destination, yuy2_pitch bytes per row
 * width, height      frame size in pixels; pixels are handled in pairs
 *                    (width / 2), rows in pairs (height / 2) or, when
 *                    interlaced, in groups of four (height / 4 per field)
 * progressive        non-zero: consecutive rows share a chroma row;
 *                    zero: the even rows and the odd rows are converted
 *                    as two independent fields, each using every other
 *                    chroma row.
 */
static void yv12_to_yuy2_c
    (const unsigned char *y_src, int y_src_pitch,
     const unsigned char *u_src, int u_src_pitch,
     const unsigned char *v_src, int v_src_pitch,
     unsigned char *yuy2_map, int yuy2_pitch,
     int width, int height, int progressive)
{
    if ( progressive )
    {
        yv12_to_yuy2_field_c( y_src, y_src_pitch,
                              u_src, u_src_pitch,
                              v_src, v_src_pitch,
                              yuy2_map, yuy2_pitch,
                              width, height / 2 );
    }
    else
    {
        /* Field starting on row 0. */
        yv12_to_yuy2_field_c( y_src, 2 * y_src_pitch,
                              u_src, 2 * u_src_pitch,
                              v_src, 2 * v_src_pitch,
                              yuy2_map, 2 * yuy2_pitch,
                              width, height / 4 );

        /* Field starting on row 1. */
        yv12_to_yuy2_field_c( y_src + y_src_pitch, 2 * y_src_pitch,
                              u_src + u_src_pitch, 2 * u_src_pitch,
                              v_src + v_src_pitch, 2 * v_src_pitch,
                              yuy2_map + yuy2_pitch, 2 * yuy2_pitch,
                              width, height / 4 );
    }
}
00189
00190
#ifdef HAVE_MMX

/*
 * MMXEXT version of C_YUV420_YUYV: handles 8 pixels on two output rows
 * per invocation.
 *
 * Reads 8 luma bytes from p_y1 and from p_y2 and 4 bytes each from p_u,
 * p_v, p_u2 and p_v2; writes 16 bytes to p_line1 and 16 bytes to p_line2
 * with non-temporal stores (movntq), then advances all eight pointers.
 * The first output row gets the p_u/p_v chroma unchanged, the second row
 * gets pavgb(p_u/p_v, p_u2/p_v2).  pavgb rounds up, whereas the C macro
 * truncates, so the two paths can differ by one in averaged chroma.
 *
 * NOTE(review): the work is split over three separate asm statements that
 * pass values to each other in mm0-mm3 and declare neither clobbers nor a
 * "memory" clobber; this relies on the compiler not touching MMX
 * registers or reordering the stores in between.  Callers must issue
 * sfence/emms afterwards (yv12_to_yuy2_mmxext does).
 */
#define MMXEXT_YUV420_YUYV( ) \
do { \
__asm__ __volatile__(".align 8 \n\t" \
"movq (%0), %%mm0 \n\t" /* mm0 = 8 luma bytes of the first row */ \
"movd (%1), %%mm1 \n\t" /* mm1 = 4 U bytes */ \
"movd (%2), %%mm2 \n\t" /* mm2 = 4 V bytes */ \
"punpcklbw %%mm2, %%mm1 \n\t" /* mm1 = U0 V0 U1 V1 U2 V2 U3 V3 */ \
"movq %%mm0, %%mm2 \n\t" \
"punpcklbw %%mm1, %%mm2 \n\t" /* mm2 = Y0 U0 Y1 V0 Y2 U1 Y3 V1 */ \
: \
: "r" (p_y1), "r" (p_u), "r" (p_v) ); \
__asm__ __volatile__( \
"movd (%0), %%mm3 \n\t" /* mm3 = 4 U bytes of the second chroma row */ \
"movd (%1), %%mm4 \n\t" /* mm4 = 4 V bytes of the second chroma row */ \
"punpcklbw %%mm4, %%mm3 \n\t" /* interleave them as U V U V ... */ \
"pavgb %%mm1, %%mm3 \n\t" /* mm3 = rounded average of both chroma rows */ \
: \
: "r" (p_u2), "r" (p_v2) ); \
__asm__ __volatile__( \
"movntq %%mm2, (%0) \n\t" /* first row, pixels 0-3 */ \
"punpckhbw %%mm1, %%mm0 \n\t" /* mm0 = Y4 U2 Y5 V2 Y6 U3 Y7 V3 */ \
"movntq %%mm0, 8(%0) \n\t" /* first row, pixels 4-7 */ \
"movq (%2), %%mm0 \n\t" /* mm0 = 8 luma bytes of the second row */ \
"movq %%mm0, %%mm2 \n\t" \
"punpcklbw %%mm3, %%mm2 \n\t" /* low half with averaged chroma */ \
"movntq %%mm2, (%1) \n\t" /* second row, pixels 0-3 */ \
"punpckhbw %%mm3, %%mm0 \n\t" /* high half with averaged chroma */ \
"movntq %%mm0, 8(%1) \n\t" /* second row, pixels 4-7 */ \
: \
: "r" (p_line1), "r" (p_line2), "r" (p_y2) ); \
p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \
p_u2 += 4; p_v2 += 4; \
} while(0)

#endif
00228
#ifdef HAVE_MMX
/*
 * Convert row_pairs pairs of luma rows into packed Y U Y V, eight pixels
 * at a time with MMXEXT_YUV420_YUYV and the remaining (width % 8) / 2
 * pixel pairs with C_YUV420_YUYV.
 *
 * Each *_step is the byte distance between two rows that belong together
 * (the pitch for progressive material, twice the pitch for one field).
 * The last pair has no following chroma row and reuses its own, so no
 * chroma row outside the plane is selected.
 *
 * The local pointer names are dictated by the two macros.
 */
static void yv12_to_yuy2_field_mmxext
    (const uint8_t *y_row, int y_step,
     const uint8_t *u_row, int u_step,
     const uint8_t *v_row, int v_step,
     uint8_t *dst_row, int dst_step,
     int width, int row_pairs)
{
    int pair, x;

    for ( pair = row_pairs; pair > 0; pair-- )
    {
        const int last = ( pair == 1 );

        uint8_t *p_line1 = dst_row;
        uint8_t *p_line2 = dst_row + dst_step;
        const uint8_t *p_y1 = y_row;
        const uint8_t *p_y2 = y_row + y_step;
        const uint8_t *p_u  = u_row;
        const uint8_t *p_v  = v_row;
        const uint8_t *p_u2 = last ? u_row : u_row + u_step;
        const uint8_t *p_v2 = last ? v_row : v_row + v_step;

        for ( x = width / 8; x > 0; x-- )
        {
            MMXEXT_YUV420_YUYV( );
        }
        for ( x = (width % 8) / 2; x > 0; x-- )
        {
            C_YUV420_YUYV( );
        }

        dst_row += 2 * dst_step;
        y_row   += 2 * y_step;
        u_row   += u_step;
        v_row   += v_step;
    }
}
#endif

/*
 * Planar Y/U/V to packed Y U Y V, MMXEXT version.  Same parameters and
 * row pairing as yv12_to_yuy2_c: progressive frames pair consecutive
 * rows, interlaced frames are converted as two independent fields.
 *
 * When HAVE_MMX is not defined this function is an empty stub that
 * leaves yuy2_map untouched.
 */
static void yv12_to_yuy2_mmxext
    (const unsigned char *y_src, int y_src_pitch,
     const unsigned char *u_src, int u_src_pitch,
     const unsigned char *v_src, int v_src_pitch,
     unsigned char *yuy2_map, int yuy2_pitch,
     int width, int height, int progressive )
{
#ifdef HAVE_MMX
    if ( progressive )
    {
        yv12_to_yuy2_field_mmxext( y_src, y_src_pitch,
                                   u_src, u_src_pitch,
                                   v_src, v_src_pitch,
                                   yuy2_map, yuy2_pitch,
                                   width, height / 2 );
    }
    else
    {
        /* Field starting on row 0. */
        yv12_to_yuy2_field_mmxext( y_src, 2 * y_src_pitch,
                                   u_src, 2 * u_src_pitch,
                                   v_src, 2 * v_src_pitch,
                                   yuy2_map, 2 * yuy2_pitch,
                                   width, height / 4 );

        /* Field starting on row 1. */
        yv12_to_yuy2_field_mmxext( y_src + y_src_pitch, 2 * y_src_pitch,
                                   u_src + u_src_pitch, 2 * u_src_pitch,
                                   v_src + v_src_pitch, 2 * v_src_pitch,
                                   yuy2_map + yuy2_pitch, 2 * yuy2_pitch,
                                   width, height / 4 );
    }

    /* Make the non-temporal stores visible, then leave MMX state. */
    sfence();
    emms();

#endif
}
00369
/*
 * Split two horizontally adjacent packed pixels (Y U Y V) from two input
 * rows into planar output.
 *
 * Uses (and advances) the caller's local pointers:
 *   p_line1, p_line2   packed input rows
 *   p_y1, p_y2         luma output rows (two bytes each)
 *   p_u, p_v           chroma output; each receives the truncating
 *                      average of the two input rows
 *
 * Wrapped in do { } while (0) so that "C_YUYV_YUV420( );" is a single
 * statement and stays correct under an unbraced if/else.
 */
#define C_YUYV_YUV420( )                                                   \
    do {                                                                   \
        *p_y1++ = *p_line1++; *p_y2++ = *p_line2++;                        \
        *p_u++ = (*p_line1++ + *p_line2++)>>1;                             \
        *p_y1++ = *p_line1++; *p_y2++ = *p_line2++;                        \
        *p_v++ = (*p_line1++ + *p_line2++)>>1;                             \
    } while (0)
00375
/*
 * Packed Y U Y V to planar Y/U/V with one chroma row per two luma rows,
 * portable C version.
 *
 * yuy2_map            source, yuy2_pitch bytes per row
 * y_dst/u_dst/v_dst   destination planes with their pitches in bytes
 * width, height       frame size in pixels
 *
 * Rows are consumed in pairs (height / 2); each pair yields two luma rows
 * and one chroma row holding the truncating average of the two input
 * rows.  Pixels are consumed in pairs (width / 2), so widths that are not
 * a multiple of 8 are converted completely, and every row starts exactly
 * one pitch after the previous one regardless of width.
 */
static void yuy2_to_yv12_c
    (const unsigned char *yuy2_map, int yuy2_pitch,
     unsigned char *y_dst, int y_dst_pitch,
     unsigned char *u_dst, int u_dst_pitch,
     unsigned char *v_dst, int v_dst_pitch,
     int width, int height)
{
    const int pixel_pairs = width / 2;
    const uint8_t *src_row = yuy2_map;
    uint8_t *y_row = y_dst;
    uint8_t *u_row = u_dst;
    uint8_t *v_row = v_dst;
    int pair, x;

    for ( pair = height / 2; pair > 0; pair-- )
    {
        const uint8_t *p_line1 = src_row;
        const uint8_t *p_line2 = src_row + yuy2_pitch;
        uint8_t *p_y1 = y_row;
        uint8_t *p_y2 = y_row + y_dst_pitch;
        uint8_t *p_u  = u_row;
        uint8_t *p_v  = v_row;

        for ( x = 0; x < pixel_pairs; x++ )
        {
            *p_y1++ = *p_line1++;
            *p_y2++ = *p_line2++;
            *p_u++  = (*p_line1++ + *p_line2++) >> 1;
            *p_y1++ = *p_line1++;
            *p_y2++ = *p_line2++;
            *p_v++  = (*p_line1++ + *p_line2++) >> 1;
        }

        src_row += 2 * yuy2_pitch;
        y_row   += 2 * y_dst_pitch;
        u_row   += u_dst_pitch;
        v_row   += v_dst_pitch;
    }
}
00419
00420
#ifdef HAVE_MMX


/*
 * MMXEXT version of C_YUYV_YUV420: handles 8 pixels of two input rows
 * per invocation.
 *
 * Reads 16 packed bytes from p_line1 and from p_line2, writes 8 luma
 * bytes to p_y1 and to p_y2 (non-temporal movntq), averages the chroma
 * of both rows with pavgb and writes 4 bytes each to p_u and p_v, then
 * advances all six pointers.  pavgb rounds up, whereas the C macro
 * truncates, so the two paths can differ by one in chroma.
 *
 * Expects mm7 to hold 0x00FF in every 16-bit word; yuy2_to_yv12_mmxext
 * loads it before its loop.
 *
 * NOTE(review): the three asm statements hand values to each other in
 * mm0/mm1 and declare neither clobbers nor a "memory" clobber; this
 * relies on the compiler leaving MMX registers and the store order alone.
 */
#define MMXEXT_YUYV_YUV420( ) \
do { \
__asm__ __volatile__(".align 8 \n\t" \
"movq (%0), %%mm0 \n\t" /* first row, pixels 0-3 */ \
"movq 8(%0), %%mm1 \n\t" /* first row, pixels 4-7 */ \
"movq %%mm0, %%mm2 \n\t" \
"movq %%mm1, %%mm3 \n\t" \
"psrlw $8, %%mm0 \n\t" /* keep the odd (chroma) bytes */ \
"psrlw $8, %%mm1 \n\t" \
"pand %%mm7, %%mm2 \n\t" /* keep the even (luma) bytes */ \
"pand %%mm7, %%mm3 \n\t" \
"packuswb %%mm1, %%mm0 \n\t" /* mm0 = U0 V0 U1 V1 U2 V2 U3 V3 */ \
"packuswb %%mm3, %%mm2 \n\t" /* mm2 = Y0 .. Y7 */ \
"movntq %%mm2, (%1) \n\t" \
: \
: "r" (p_line1), "r" (p_y1) ); \
__asm__ __volatile__(".align 8 \n\t" \
"movq (%0), %%mm1 \n\t" /* second row, pixels 0-3 */ \
"movq 8(%0), %%mm2 \n\t" /* second row, pixels 4-7 */ \
"movq %%mm1, %%mm3 \n\t" \
"movq %%mm2, %%mm4 \n\t" \
"psrlw $8, %%mm1 \n\t" \
"psrlw $8, %%mm2 \n\t" \
"pand %%mm7, %%mm3 \n\t" \
"pand %%mm7, %%mm4 \n\t" \
"packuswb %%mm2, %%mm1 \n\t" /* mm1 = chroma of the second row */ \
"packuswb %%mm4, %%mm3 \n\t" /* mm3 = luma of the second row */ \
"movntq %%mm3, (%1) \n\t" \
: \
: "r" (p_line2), "r" (p_y2) ); \
__asm__ __volatile__( \
"pavgb %%mm1, %%mm0 \n\t" /* rounded average of both chroma rows */ \
"movq %%mm0, %%mm1 \n\t" \
"psrlw $8, %%mm0 \n\t" /* odd bytes = V samples */ \
"packuswb %%mm0, %%mm0 \n\t" \
"movd %%mm0, (%0) \n\t" /* 4 V bytes -> p_v */ \
"pand %%mm7, %%mm1 \n\t" /* even bytes = U samples */ \
"packuswb %%mm1, %%mm1 \n\t" \
"movd %%mm1, (%1) \n\t" /* 4 U bytes -> p_u */ \
: \
: "r" (p_v), "r" (p_u) ); \
p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \
} while(0)

#endif
00469
/*
 * Packed Y U Y V to planar Y/U/V with one chroma row per two luma rows,
 * MMXEXT version.  Same parameters as yuy2_to_yv12_c.
 *
 * Each row pair is processed eight pixels at a time with
 * MMXEXT_YUYV_YUV420; the remaining (width % 8) / 2 pixel pairs are
 * finished with C_YUYV_YUV420 so that widths which are not a multiple of
 * 8 are converted completely.  Row starts are derived from the pitches,
 * so they do not depend on how many pixels the inner loops consumed.
 *
 * When HAVE_MMX is not defined this function is an empty stub that
 * leaves the destination planes untouched.
 */
static void yuy2_to_yv12_mmxext
    (const unsigned char *yuy2_map, int yuy2_pitch,
     unsigned char *y_dst, int y_dst_pitch,
     unsigned char *u_dst, int u_dst_pitch,
     unsigned char *v_dst, int v_dst_pitch,
     int width, int height)
{
#ifdef HAVE_MMX
    const uint8_t *src_row = yuy2_map;
    uint8_t *y_row = y_dst;
    uint8_t *u_row = u_dst;
    uint8_t *v_row = v_dst;
    int pair, x;

    /* mm7 = 0x00FF in every word: the luma/even-byte mask the macro uses. */
    __asm__ __volatile__(
        "pcmpeqw %mm7, %mm7 \n\t"
        "psrlw $8, %mm7 \n\t"
        );

    for ( pair = height / 2; pair > 0; pair-- )
    {
        /* Names are dictated by the two conversion macros. */
        const uint8_t *p_line1 = src_row;
        const uint8_t *p_line2 = src_row + yuy2_pitch;
        uint8_t *p_y1 = y_row;
        uint8_t *p_y2 = y_row + y_dst_pitch;
        uint8_t *p_u  = u_row;
        uint8_t *p_v  = v_row;

        for ( x = width / 8; x > 0; x-- )
        {
            MMXEXT_YUYV_YUV420( );
        }
        for ( x = (width % 8) / 2; x > 0; x-- )
        {
            C_YUYV_YUV420( );
        }

        src_row += 2 * yuy2_pitch;
        y_row   += 2 * y_dst_pitch;
        u_row   += u_dst_pitch;
        v_row   += v_dst_pitch;
    }

    /* Make the non-temporal stores visible, then leave MMX state. */
    sfence();
    emms();
#endif
}
00518
#ifdef HAVE_MMX
/*
 * Vertical chroma filter for one packed scanline, MMX version.
 *
 * For every pixel (two bytes: luma at the even offset, chroma at the odd
 * offset) the luma byte is copied from m and the chroma byte becomes
 * (3 * t + 3 * m + 2 * b) >> 3.
 *
 * output   destination scanline (may be the same buffer as m)
 * width    number of pixels, i.e. width * 2 bytes per scanline
 * m, t, b  the three input scanlines being blended
 *
 * Eight bytes (four pixels) are handled per MMX iteration; the leftover
 * width % 4 pixels are finished in C.  The tail counts pixels, not bytes,
 * so it stops at the end of the scanline.  Note that the tail only writes
 * chroma bytes, exactly like the C implementation.
 */
static void vfilter_chroma_332_packed422_scanline_mmx( uint8_t *output, int width,
                                                       uint8_t *m, uint8_t *t, uint8_t *b )
{
    const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
    const mmx_t cmask = { 0xff00ff00ff00ff00ULL };
    const int row_bytes = width * 2;
    int groups = row_bytes / 8;
    int tail_pixels = (row_bytes % 8) / 2;

    movq_m2r( ymask, mm7 );
    movq_m2r( cmask, mm6 );

    while ( groups-- > 0 )
    {
        movq_m2r( *t, mm0 );
        movq_m2r( *b, mm1 );
        movq_m2r( *m, mm2 );

        /* mm3 = luma of m, kept for the final merge. */
        movq_r2r ( mm2, mm3 );
        pand_r2r ( mm7, mm3 );

        /* Isolate the chroma bytes of all three lines. */
        pand_r2r ( mm6, mm0 );
        pand_r2r ( mm6, mm1 );
        pand_r2r ( mm6, mm2 );

        /* Move chroma into the low byte of each word; shifting b by 7
         * instead of 8 leaves 2 * b. */
        psrlq_i2r( 8, mm0 );
        psrlq_i2r( 7, mm1 );
        psrlq_i2r( 8, mm2 );

        /* mm0 = 3 * t */
        movq_r2r ( mm0, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm0 );

        /* mm2 = 3 * m */
        movq_r2r ( mm2, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm2 );

        /* mm2 = 3 * t + 3 * m + 2 * b */
        paddw_r2r( mm0, mm2 );
        paddw_r2r( mm1, mm2 );

        /* << 5 then keeping the high byte equals >> 3 placed in the
         * chroma position. */
        psllw_i2r( 5, mm2 );
        pand_r2r( mm6, mm2 );

        por_r2r ( mm3, mm2 );

        movq_r2m( mm2, *output );
        output += 8;
        t += 8;
        b += 8;
        m += 8;
    }

    /* Step onto the chroma byte of the first leftover pixel. */
    output++; t++; b++; m++;
    while ( tail_pixels-- > 0 )
    {
        *output = (3 * *t + 3 * *m + 2 * *b) >> 3;
        output += 2; t += 2; b += 2; m += 2;
    }

    emms();
}
#endif
00584
/*
 * Vertical chroma filter for one packed scanline, portable C version.
 *
 * Each pixel occupies two bytes; only the byte at the odd offset is
 * rewritten, as (3 * t + 3 * m + 2 * b) >> 3.  Bytes at even offsets of
 * output are left untouched.
 *
 * output   destination scanline (may be the same buffer as an input)
 * width    number of pixels to process
 * m, t, b  the three input scanlines being blended
 */
static void vfilter_chroma_332_packed422_scanline_c( uint8_t *output, int width,
                                                     uint8_t *m, uint8_t *t, uint8_t *b )
{
    int px;

    for ( px = 0; px < width; px++ )
    {
        const int c = 2 * px + 1;
        const int weighted = 3 * t[c] + 3 * m[c] + 2 * b[c];

        output[c] = (uint8_t)( weighted >> 3 );
    }
}
00595
00596
00597
00598
00599
00600
00601
00602 void init_yuv_conversion(void)
00603 {
00604
00605
00606
00607
00608 #ifdef MMX
00609 if (mm_support() & MM_MMXEXT)
00610 {
00611 yv12_to_yuy2 = yv12_to_yuy2_mmxext;
00612 yuy2_to_yv12 = yuy2_to_yv12_mmxext;
00613 vfilter_chroma_332_packed422_scanline = vfilter_chroma_332_packed422_scanline_mmx;
00614 }
00615 else
00616 #endif
00617 {
00618 yv12_to_yuy2 = yv12_to_yuy2_c;
00619 yuy2_to_yv12 = yuy2_to_yv12_c;
00620 vfilter_chroma_332_packed422_scanline = vfilter_chroma_332_packed422_scanline_c;
00621 }
00622 }
00623
/*
 * Run the selected vertical chroma filter over a packed image, in place.
 *
 * data     first scanline of the image
 * stride   distance in bytes between consecutive scanlines
 * width    pixels per scanline, passed through to the filter
 * height   number of scanlines
 *
 * Every scanline is filtered against the scanline above and below it; the
 * first and last scanline use themselves for the missing neighbour.
 * Because the image is filtered in place, the scanline above has already
 * been filtered when it is read.  Calls through
 * vfilter_chroma_332_packed422_scanline, so init_yuv_conversion() must
 * have run first.
 */
void apply_chroma_filter( uint8_t *data, int stride, int width, int height )
{
    uint8_t *row = data;
    int y;

    for ( y = 0; y < height; y++ )
    {
        uint8_t *above = ( y != 0 ) ? ( row - stride ) : row;
        uint8_t *below = ( y < height - 1 ) ? ( row + stride ) : row;

        vfilter_chroma_332_packed422_scanline( row, width, row, above, below );

        row += stride;
    }
}
00640
00641