00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
/* 8-byte aligned byte-pattern constants: every 16-bit lane holds 0x0103
 * (ff_pb_3_1) or 0x0307 (ff_pb_7_3).  h264_loop_filter_strength_mmx2() loads
 * them into mm5/mm4 in place of ff_pb_3/ff_pb_7 when its `field` argument is
 * non-zero. */
00022 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
00023 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
00024
00025
00026
00027
00028
/* Butterfly on packed 16-bit words (wrapping arithmetic), in place:
 *   a_out = a + b
 *   b_out = b - a      (computed as 2*b - (a + b))
 * Both arguments are MMX register names; no other register is touched. */
00029 #define SUMSUB_BA( a, b ) \
00030 "paddw "#b", "#a" \n\t"\
00031 "paddw "#b", "#b" \n\t"\
00032 "psubw "#a", "#b" \n\t"
00033
/* Two interleaved SUMSUB_BA butterflies on packed 16-bit words:
 *   a_out = a + b,  b_out = b - a
 *   c_out = c + d,  d_out = d - c
 * The two pairs are independent; their instructions are merely interleaved. */
00034 #define SUMSUB_BADC( a, b, c, d ) \
00035 "paddw "#b", "#a" \n\t"\
00036 "paddw "#d", "#c" \n\t"\
00037 "paddw "#b", "#b" \n\t"\
00038 "paddw "#d", "#d" \n\t"\
00039 "psubw "#a", "#b" \n\t"\
00040 "psubw "#c", "#d" \n\t"
00041
/* Half-weighted butterfly on packed 16-bit words (arithmetic shifts):
 *   b_out = a + (b >> 1)
 *   a_out = (a >> 1) - b
 * t is a scratch register; on exit it holds the original value of b. */
00042 #define SUMSUBD2_AB( a, b, t ) \
00043 "movq "#b", "#t" \n\t"\
00044 "psraw $1 , "#b" \n\t"\
00045 "paddw "#a", "#b" \n\t"\
00046 "psraw $1 , "#a" \n\t"\
00047 "psubw "#t", "#a" \n\t"
00048
/* One 1-D pass of the 4-point transform, built from the butterflies above:
 *   1. SUMSUB_BA   on (s02, d02)
 *   2. SUMSUBD2_AB on (s13, d13), using t as scratch
 *   3. SUMSUB_BADC combining the two intermediate pairs
 * All four of s02, s13, d02 and d13 are overwritten with results; t is
 * clobbered. */
00049 #define IDCT4_1D( s02, s13, d02, d13, t ) \
00050 SUMSUB_BA ( s02, d02 )\
00051 SUMSUBD2_AB( s13, d13, t )\
00052 SUMSUB_BADC( d13, s02, s13, d02 )
00053
/* Add four 16-bit residuals to four pixels and store them back.
 *   p : register with four words; shifted right by 6 (arithmetic) first
 *   t : scratch register, receives the four pixels read from (%0)
 *   z : register that must hold zero (used to widen/narrow bytes)
 * The sum is formed with signed saturation and packed to bytes with unsigned
 * saturation before being written back to the 4 bytes at (%0).  Asm operand
 * %0 must therefore be the destination pixel pointer. */
00054 #define STORE_DIFF_4P( p, t, z ) \
00055 "psraw $6, "#p" \n\t"\
00056 "movd (%0), "#t" \n\t"\
00057 "punpcklbw "#z", "#t" \n\t"\
00058 "paddsw "#t", "#p" \n\t"\
00059 "packuswb "#z", "#p" \n\t"\
00060 "movd "#p", (%0) \n\t"
00061
/**
 * 4x4 inverse transform of `block`, with the result added to the 4x4 pixel
 * area starting at `dst`.
 *
 * The work is split over three asm statements that hand their intermediate
 * values to each other in mm0-mm7; they are all `volatile`, so the compiler
 * keeps them in order.
 *
 * @param dst    top-left destination pixel; 4 rows of 4 bytes are read,
 *               modified and written back
 * @param block  16 coefficients (32 bytes), read only
 * @param stride distance in bytes between destination rows
 *
 * The first and last asm statements access memory through plain register
 * pointers, which the operand lists do not describe; they carry a "memory"
 * clobber so the compiler neither caches nor reorders accesses to `block`
 * and `dst` around them.
 */
00062 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
00063 {
00064
      /* Load the four coefficient rows (4 words each) into mm0..mm3. */
00065 asm volatile(
00066 "movq (%0), %%mm0 \n\t"
00067 "movq 8(%0), %%mm1 \n\t"
00068 "movq 16(%0), %%mm2 \n\t"
00069 "movq 24(%0), %%mm3 \n\t"
00070 :: "r"(block) : "memory" );
00071
00072 asm volatile(
00073
      /* first 1-D pass */
00074 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
00075
      /* mm6 = ff_pw_32 */
00076 "movq %0, %%mm6 \n\t"
00077
      /* swap rows and columns between the two passes */
00078 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
00079
      /* add ff_pw_32 ahead of the >>6 done by STORE_DIFF_4P */
00080 "paddw %%mm6, %%mm3 \n\t"
00081
00082
      /* second 1-D pass */
00083 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
00084
      /* mm7 = 0, needed as the zero register by STORE_DIFF_4P */
00085 "pxor %%mm7, %%mm7 \n\t"
00086 :: "m"(ff_pw_32));
00087
      /* Add the four result rows to the destination, one row per step. */
00088 asm volatile(
00089 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
00090 "add %1, %0 \n\t"
00091 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
00092 "add %1, %0 \n\t"
00093 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
00094 "add %1, %0 \n\t"
00095 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
00096 : "+r"(dst)
00097 : "r" ((long)stride) : "memory"
00098 );
00099 }
00100
/**
 * One 1-D pass of the 8-point transform over four columns.
 *
 * Reads eight rows of four 16-bit words each from `block` (row pitch is
 * 16 bytes, i.e. byte offsets 0, 16, ..., 112) and leaves the eight result
 * rows in mm0-mm7.  Nothing is stored; the caller must consume the registers
 * in its next asm statement.  ff_h264_idct8_add_mmx() stores them in the row
 * order mm7, mm5, mm3, mm1, mm0, mm2, mm4, mm6.
 *
 * @param block first of the four columns to transform, read only
 *
 * The asm reads `block` through a register pointer only, so it carries a
 * "memory" clobber: without it the compiler may move ordinary C stores to the
 * block (such as the caller's `block[0] += 32`) past this statement.
 */
00101 static inline void h264_idct8_1d(int16_t *block)
00102 {
00103 asm volatile(
      /* odd rows 7, 5, 3, 1 */
00104 "movq 112(%0), %%mm7 \n\t"
00105 "movq 80(%0), %%mm5 \n\t"
00106 "movq 48(%0), %%mm3 \n\t"
00107 "movq 16(%0), %%mm1 \n\t"
00108
00109 "movq %%mm7, %%mm4 \n\t"
00110 "movq %%mm3, %%mm6 \n\t"
00111 "movq %%mm5, %%mm0 \n\t"
00112 "movq %%mm7, %%mm2 \n\t"
00113 "psraw $1, %%mm4 \n\t"
00114 "psraw $1, %%mm6 \n\t"
00115 "psubw %%mm7, %%mm0 \n\t"
00116 "psubw %%mm6, %%mm2 \n\t"
00117 "psubw %%mm4, %%mm0 \n\t"
00118 "psubw %%mm3, %%mm2 \n\t"
00119 "psubw %%mm3, %%mm0 \n\t"
00120 "paddw %%mm1, %%mm2 \n\t"
00121
00122 "movq %%mm5, %%mm4 \n\t"
00123 "movq %%mm1, %%mm6 \n\t"
00124 "psraw $1, %%mm4 \n\t"
00125 "psraw $1, %%mm6 \n\t"
00126 "paddw %%mm5, %%mm4 \n\t"
00127 "paddw %%mm1, %%mm6 \n\t"
00128 "paddw %%mm7, %%mm4 \n\t"
00129 "paddw %%mm5, %%mm6 \n\t"
00130 "psubw %%mm1, %%mm4 \n\t"
00131 "paddw %%mm3, %%mm6 \n\t"
00132
      /* combine the four odd-row intermediates (quarter-weighted terms) */
00133 "movq %%mm0, %%mm1 \n\t"
00134 "movq %%mm4, %%mm3 \n\t"
00135 "movq %%mm2, %%mm5 \n\t"
00136 "movq %%mm6, %%mm7 \n\t"
00137 "psraw $2, %%mm6 \n\t"
00138 "psraw $2, %%mm3 \n\t"
00139 "psraw $2, %%mm5 \n\t"
00140 "psraw $2, %%mm0 \n\t"
00141 "paddw %%mm6, %%mm1 \n\t"
00142 "paddw %%mm2, %%mm3 \n\t"
00143 "psubw %%mm4, %%mm5 \n\t"
00144 "psubw %%mm0, %%mm7 \n\t"
00145
      /* rows 2 and 6 */
00146 "movq 32(%0), %%mm2 \n\t"
00147 "movq 96(%0), %%mm6 \n\t"
00148 "movq %%mm2, %%mm4 \n\t"
00149 "movq %%mm6, %%mm0 \n\t"
00150 "psraw $1, %%mm4 \n\t"
00151 "psraw $1, %%mm6 \n\t"
00152 "psubw %%mm0, %%mm4 \n\t"
00153 "paddw %%mm2, %%mm6 \n\t"
00154
      /* rows 0 and 4, then the final butterflies */
00155 "movq (%0), %%mm2 \n\t"
00156 "movq 64(%0), %%mm0 \n\t"
00157 SUMSUB_BA( %%mm0, %%mm2 )
00158 SUMSUB_BA( %%mm6, %%mm0 )
00159 SUMSUB_BA( %%mm4, %%mm2 )
00160 SUMSUB_BA( %%mm7, %%mm6 )
00161 SUMSUB_BA( %%mm5, %%mm4 )
00162 SUMSUB_BA( %%mm3, %%mm2 )
00163 SUMSUB_BA( %%mm1, %%mm0 )
00164 :: "r"(block) : "memory"
00165 );
00166 }
00167
/**
 * 8x8 inverse transform of `block`, with the result added to the 8x8 pixel
 * area at `dst`.
 *
 * @param dst    top-left destination pixel, passed on to
 *               add_pixels_clamped_mmx()
 * @param block  64 coefficients; NOTE: block[0] is modified (32 is added)
 * @param stride destination row pitch, passed on to add_pixels_clamped_mmx()
 *
 * Both passes handle the block as two halves of four columns each.  The MMX
 * registers filled by h264_idct8_1d() are consumed by the asm statement that
 * follows each call, so the call/asm pairs must stay adjacent.
 */
00168 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
00169 {
00170 int i;
      /* intermediate 8x8 buffer: transposed output of pass 1, then final residual */
00171 int16_t __attribute__ ((aligned(8))) b2[64];
00172
      /* 32 is half of the 1<<6 removed by the final psraw $6 below */
00173 block[0] += 32;
00174
      /* Pass 1: transform four columns of `block`, write them transposed to b2. */
00175 for(i=0; i<2; i++){
      /* spill slot for mm7, which TRANSPOSE4 needs as a temporary */
00176 DECLARE_ALIGNED_8(uint64_t, tmp);
00177
00178 h264_idct8_1d(block+4*i);
00179
00180 asm volatile(
00181 "movq %%mm7, %0 \n\t"
00182 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
00183 "movq %%mm0, 8(%1) \n\t"
00184 "movq %%mm6, 24(%1) \n\t"
00185 "movq %%mm7, 40(%1) \n\t"
00186 "movq %%mm4, 56(%1) \n\t"
      /* reload the spilled mm7 and transpose the other four rows */
00187 "movq %0, %%mm7 \n\t"
00188 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
00189 "movq %%mm7, (%1) \n\t"
00190 "movq %%mm1, 16(%1) \n\t"
00191 "movq %%mm0, 32(%1) \n\t"
00192 "movq %%mm3, 48(%1) \n\t"
00193 : "=m"(tmp)
00194 : "r"(b2+32*i)
00195 : "memory"
00196 );
00197 }
00198
      /* Pass 2: transform four columns of b2, scale by >>6, store back in place. */
00199 for(i=0; i<2; i++){
00200 h264_idct8_1d(b2+4*i);
00201
00202 asm volatile(
00203 "psraw $6, %%mm7 \n\t"
00204 "psraw $6, %%mm6 \n\t"
00205 "psraw $6, %%mm5 \n\t"
00206 "psraw $6, %%mm4 \n\t"
00207 "psraw $6, %%mm3 \n\t"
00208 "psraw $6, %%mm2 \n\t"
00209 "psraw $6, %%mm1 \n\t"
00210 "psraw $6, %%mm0 \n\t"
00211
      /* rows 0..7 come out of the 1-D pass in mm7, mm5, mm3, mm1, mm0, mm2, mm4, mm6 */
00212 "movq %%mm7, (%0) \n\t"
00213 "movq %%mm5, 16(%0) \n\t"
00214 "movq %%mm3, 32(%0) \n\t"
00215 "movq %%mm1, 48(%0) \n\t"
00216 "movq %%mm0, 64(%0) \n\t"
00217 "movq %%mm2, 80(%0) \n\t"
00218 "movq %%mm4, 96(%0) \n\t"
00219 "movq %%mm6, 112(%0) \n\t"
00220 :: "r"(b2+4*i)
00221 : "memory"
00222 );
00223 }
00224
      /* add the residual in b2 to the destination pixels */
00225 add_pixels_clamped_mmx(b2, dst, stride);
00226 }
00227
/**
 * DC-only variant for a 4x4 block: adds (block[0] + 32) >> 6 to each of the
 * 4x4 pixels at `dst`, saturating to the 0..255 byte range.
 *
 * @param dst    top-left destination pixel (4 rows of 4 bytes are updated)
 * @param block  only block[0] is read
 * @param stride distance in bytes between destination rows
 */
00228 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
00229 {
00230 int dc = (block[0] + 32) >> 6;
      /* mm0 = bytes of max(dc, 0), mm1 = bytes of max(-dc, 0), each clipped to 255,
       * so a signed dc can be applied with unsigned saturating byte operations */
00231 asm volatile(
00232 "movd %0, %%mm0 \n\t"
00233 "pshufw $0, %%mm0, %%mm0 \n\t"
00234 "pxor %%mm1, %%mm1 \n\t"
00235 "psubw %%mm0, %%mm1 \n\t"
00236 "packuswb %%mm0, %%mm0 \n\t"
00237 "packuswb %%mm1, %%mm1 \n\t"
00238 ::"r"(dc)
00239 );
      /* four rows of four pixels: pixel = sat(sat(pixel + mm0) - mm1) */
00240 asm volatile(
00241 "movd %0, %%mm2 \n\t"
00242 "movd %1, %%mm3 \n\t"
00243 "movd %2, %%mm4 \n\t"
00244 "movd %3, %%mm5 \n\t"
00245 "paddusb %%mm0, %%mm2 \n\t"
00246 "paddusb %%mm0, %%mm3 \n\t"
00247 "paddusb %%mm0, %%mm4 \n\t"
00248 "paddusb %%mm0, %%mm5 \n\t"
00249 "psubusb %%mm1, %%mm2 \n\t"
00250 "psubusb %%mm1, %%mm3 \n\t"
00251 "psubusb %%mm1, %%mm4 \n\t"
00252 "psubusb %%mm1, %%mm5 \n\t"
00253 "movd %%mm2, %0 \n\t"
00254 "movd %%mm3, %1 \n\t"
00255 "movd %%mm4, %2 \n\t"
00256 "movd %%mm5, %3 \n\t"
00257 :"+m"(*(uint32_t*)(dst+0*stride)),
00258 "+m"(*(uint32_t*)(dst+1*stride)),
00259 "+m"(*(uint32_t*)(dst+2*stride)),
00260 "+m"(*(uint32_t*)(dst+3*stride))
00261 );
00262 }
00263
/**
 * DC-only variant for an 8x8 block: adds (block[0] + 32) >> 6 to each of the
 * 8x8 pixels at `dst`, saturating to the 0..255 byte range.
 *
 * @param dst    top-left destination pixel (8 rows of 8 bytes are updated)
 * @param block  only block[0] is read
 * @param stride distance in bytes between destination rows
 */
00264 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
00265 {
00266 int dc = (block[0] + 32) >> 6;
00267 int y;
      /* mm0 = bytes of max(dc, 0), mm1 = bytes of max(-dc, 0), each clipped to 255 */
00268 asm volatile(
00269 "movd %0, %%mm0 \n\t"
00270 "pshufw $0, %%mm0, %%mm0 \n\t"
00271 "pxor %%mm1, %%mm1 \n\t"
00272 "psubw %%mm0, %%mm1 \n\t"
00273 "packuswb %%mm0, %%mm0 \n\t"
00274 "packuswb %%mm1, %%mm1 \n\t"
00275 ::"r"(dc)
00276 );
      /* two iterations of four 8-pixel rows each; mm0/mm1 stay live across them */
00277 for(y=2; y--; dst += 4*stride){
00278 asm volatile(
00279 "movq %0, %%mm2 \n\t"
00280 "movq %1, %%mm3 \n\t"
00281 "movq %2, %%mm4 \n\t"
00282 "movq %3, %%mm5 \n\t"
00283 "paddusb %%mm0, %%mm2 \n\t"
00284 "paddusb %%mm0, %%mm3 \n\t"
00285 "paddusb %%mm0, %%mm4 \n\t"
00286 "paddusb %%mm0, %%mm5 \n\t"
00287 "psubusb %%mm1, %%mm2 \n\t"
00288 "psubusb %%mm1, %%mm3 \n\t"
00289 "psubusb %%mm1, %%mm4 \n\t"
00290 "psubusb %%mm1, %%mm5 \n\t"
00291 "movq %%mm2, %0 \n\t"
00292 "movq %%mm3, %1 \n\t"
00293 "movq %%mm4, %2 \n\t"
00294 "movq %%mm5, %3 \n\t"
00295 :"+m"(*(uint64_t*)(dst+0*stride)),
00296 "+m"(*(uint64_t*)(dst+1*stride)),
00297 "+m"(*(uint64_t*)(dst+2*stride)),
00298 "+m"(*(uint64_t*)(dst+3*stride))
00299 );
00300 }
00301 }
00302
00303
00304
00305
00306
00307
00308
/* Per-byte unsigned test |x - y| > a.
 *   out: o = max(|x - y| - a, 0), i.e. non-zero exactly where |x - y| > a
 *   t  : scratch register
 * x, y and a are left unchanged. */
00309 #define DIFF_GT_MMX(x,y,a,o,t)\
00310 "movq "#y", "#t" \n\t"\
00311 "movq "#x", "#o" \n\t"\
00312 "psubusb "#x", "#t" \n\t"\
00313 "psubusb "#y", "#o" \n\t"\
00314 "por "#t", "#o" \n\t"\
00315 "psubusb "#a", "#o" \n\t"
00316
00317
00318
/* Per-byte unsigned test |x - y| <= a, producing a full byte mask.
 *   out: o = 0xFF where |x - y| <= a, 0x00 where |x - y| > a
 *   t  : scratch register
 * Works because at most one of sat(x-y) and sat(y-x) is non-zero, so after
 * subtracting a the two are equal only when both are zero.
 * Note: the definition ends with a line continuation, so the line following
 * the macro must stay empty. */
00319 #define DIFF_GT2_MMX(x,y,a,o,t)\
00320 "movq "#y", "#t" \n\t"\
00321 "movq "#x", "#o" \n\t"\
00322 "psubusb "#x", "#t" \n\t"\
00323 "psubusb "#y", "#o" \n\t"\
00324 "psubusb "#a", "#t" \n\t"\
00325 "psubusb "#a", "#o" \n\t"\
00326 "pcmpeqb "#t", "#o" \n\t"\
00327
00328
00329
00330
/* Build the per-pixel "filter this position" mask.
 *   in : mm0..mm3 = four consecutive pixel rows across the edge (the callers
 *        load pix-2*stride, pix-stride, pix, pix+stride)
 *        alpha1, beta1 = operands whose low 16 bits hold the thresholds
 *   out: mm7 = 0xFF in every byte where
 *              |mm1 - mm2| <= alpha1 and |mm0 - mm1| <= beta1 and
 *              |mm3 - mm2| <= beta1, else 0x00
 *        mm5 = beta1 replicated into every byte (reused by the luma filter)
 *   clobbers: mm4, mm6 (mm6 is zero on exit) */
00331 #define H264_DEBLOCK_MASK(alpha1, beta1) \
00332 "pshufw $0, "#alpha1", %%mm4 \n\t"\
00333 "pshufw $0, "#beta1 ", %%mm5 \n\t"\
00334 "packuswb %%mm4, %%mm4 \n\t"\
00335 "packuswb %%mm5, %%mm5 \n\t"\
00336 DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) \
00337 DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) \
00338 "por %%mm4, %%mm7 \n\t"\
00339 DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) \
00340 "por %%mm4, %%mm7 \n\t"\
00341 "pxor %%mm6, %%mm6 \n\t"\
00342 "pcmpeqb %%mm6, %%mm7 \n\t"
00343
00344
00345
00346
/* Filter the two rows adjacent to the edge.
 *   in : mm0..mm3 = the four rows across the edge (same layout as for
 *        H264_DEBLOCK_MASK), mm7 = per-byte limit for the correction
 *        (applied with pminub), pb_01 = operand ANDed with (mm1 ^ mm2)
 *   out: mm1 and mm2 updated in place
 *   clobbers: mm3, mm4, mm5, mm6
 * The correction is computed with byte averages around the bias constant
 * ff_pb_A1, split into a positive part (mm3) and a negative part (mm6), each
 * limited by mm7, and then applied to mm1 and mm2 in opposite directions.
 * The pb_3f parameter is not referenced by the macro body; ff_pb_3 and
 * ff_pb_A1 are accessed directly through MANGLE(). */
00347 #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
00348 "movq %%mm1 , %%mm5 \n\t"\
00349 "pxor %%mm2 , %%mm5 \n\t" \
00350 "pand "#pb_01" , %%mm5 \n\t" \
00351 "pcmpeqb %%mm4 , %%mm4 \n\t"\
00352 "pxor %%mm4 , %%mm3 \n\t"\
00353 "pavgb %%mm0 , %%mm3 \n\t" \
00354 "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" \
00355 "pxor %%mm1 , %%mm4 \n\t"\
00356 "pavgb %%mm2 , %%mm4 \n\t" \
00357 "pavgb %%mm5 , %%mm3 \n\t"\
00358 "paddusb %%mm4 , %%mm3 \n\t" \
00359 "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
00360 "psubusb %%mm3 , %%mm6 \n\t"\
00361 "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
00362 "pminub %%mm7 , %%mm6 \n\t"\
00363 "pminub %%mm7 , %%mm3 \n\t"\
00364 "psubusb %%mm6 , %%mm1 \n\t"\
00365 "psubusb %%mm3 , %%mm2 \n\t"\
00366 "paddusb %%mm3 , %%mm1 \n\t"\
00367 "paddusb %%mm6 , %%mm2 \n\t"
00368
00369
00370
00371
/* Filter the second row away from the edge on one side and store it.
 *   p1     : register holding the row to be replaced (left unchanged)
 *   q2     : register holding the third row from the edge; overwritten
 *            with the value that is stored
 *   q2addr : address string of that third row in memory (read again)
 *   q1addr : address string the result is stored to
 *   tc0    : register with the per-byte clip range; overwritten
 *   tmp    : scratch register
 * Also reads mm1 and mm2 (the two rows next to the edge).
 * Computes (q2 + avg(mm1, mm2)) >> 1 and clamps it to
 * [sat(p1 - tc0), sat(p1 + tc0)] per byte.
 * NOTE: asm operand %8 is hard-coded here; every asm statement using this
 * macro must supply the byte constant ANDed with the xor as operand 8 (the
 * luma filter passes mm_bone there). */
00372 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
00373 "movq %%mm1, "#tmp" \n\t"\
00374 "pavgb %%mm2, "#tmp" \n\t"\
00375 "pavgb "#tmp", "#q2" \n\t" \
00376 "pxor "q2addr", "#tmp" \n\t"\
00377 "pand %8, "#tmp" \n\t" \
00378 "psubusb "#tmp", "#q2" \n\t" \
00379 "movq "#p1", "#tmp" \n\t"\
00380 "psubusb "#tc0", "#tmp" \n\t"\
00381 "paddusb "#p1", "#tc0" \n\t"\
00382 "pmaxub "#tmp", "#q2" \n\t"\
00383 "pminub "#tc0", "#q2" \n\t"\
00384 "movq "#q2", "q1addr" \n\t"
00385
/**
 * Filter one 8-pixel-wide segment of a luma edge in place.  The edge lies
 * between the rows at pix-stride and pix.
 *
 * Reads the six rows pix-3*stride .. pix+2*stride (8 bytes each) and may
 * rewrite the four rows pix-2*stride .. pix+stride.
 *
 * @param pix    first pixel of the row on the far side of the edge
 * @param stride distance in bytes between rows
 * @param alpha1 threshold for the difference across the edge; the callers
 *               pass alpha-1
 * @param beta1  threshold for the differences on each side; the callers
 *               pass beta-1
 * @param tc0    clip values; four bytes are loaded but only tc0[0] and
 *               tc0[1] are used, each for four adjacent pixels.  A negative
 *               value disables filtering for its four pixels.
 *
 * The asm stores to tmp0[1] (as 8+%0) and to the pixel rows through register
 * pointers, none of which the operand list describes, so it carries a
 * "memory" clobber.  This matters in particular when the function is inlined
 * into a caller that filters a local scratch buffer.
 */
00386 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
00387 {
      /* tmp0[0] = filter mask (incl. tc0 >= 0), tmp0[1] = expanded tc0 bytes */
00388 DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
00389
00390 asm volatile(
      /* mm0..mm3 = rows pix-2*stride, pix-stride, pix, pix+stride */
00391 "movq (%1,%3), %%mm0 \n\t"
00392 "movq (%1,%3,2), %%mm1 \n\t"
00393 "movq (%2), %%mm2 \n\t"
00394 "movq (%2,%3), %%mm3 \n\t"
00395 H264_DEBLOCK_MASK(%6, %7)
00396
      /* expand tc0[0], tc0[1] to four bytes each; drop positions with tc0 < 0 */
00397 "movd %5, %%mm4 \n\t"
00398 "punpcklbw %%mm4, %%mm4 \n\t"
00399 "punpcklwd %%mm4, %%mm4 \n\t"
00400 "pcmpeqb %%mm3, %%mm3 \n\t"
00401 "movq %%mm4, %%mm6 \n\t"
00402 "pcmpgtb %%mm3, %%mm4 \n\t"
00403 "movq %%mm6, 8+%0 \n\t"
00404 "pand %%mm4, %%mm7 \n\t"
00405 "movq %%mm7, %0 \n\t"
00406
00407
      /* near side: use row pix-3*stride to decide on and rewrite row pix-2*stride;
       * mm7 becomes tc0 plus one for every position where the test passes */
00408 "movq (%1), %%mm3 \n\t"
00409 DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4)
00410 "pand %%mm7, %%mm6 \n\t"
00411 "pand 8+%0, %%mm7 \n\t"
00412 "movq %%mm7, %%mm4 \n\t"
00413 "psubb %%mm6, %%mm7 \n\t"
00414 "pand %%mm4, %%mm6 \n\t"
00415 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
00416
00417
      /* far side: use row pix+2*stride to decide on and rewrite row pix+stride */
00418 "movq (%2,%3,2), %%mm4 \n\t"
00419 DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3)
00420 "pand %0, %%mm6 \n\t"
00421 "movq 8+%0, %%mm5 \n\t"
00422 "pand %%mm6, %%mm5 \n\t"
00423 "psubb %%mm6, %%mm7 \n\t"
00424 "movq (%2,%3), %%mm3 \n\t"
00425 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
00426
00427
      /* finally the two rows next to the edge, limited by mm7 */
00428 H264_DEBLOCK_P0_Q0(%8, unused)
00429 "movq %%mm1, (%1,%3,2) \n\t"
00430 "movq %%mm2, (%2) \n\t"
00431
00432 : "=m"(*tmp0)
00433 : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
00434 "m"(*tmp0), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
00435 "m"(mm_bone) : "memory"
00436 );
00437 }
00438
/**
 * Luma filter for a 16-pixel-wide edge whose rows are `stride` apart,
 * processed as two 8-pixel halves (pix and pix+8) with tc0[0..1] and
 * tc0[2..3] respectively.
 *
 * A half is skipped when both of its tc0 values are negative (the bitwise
 * AND of two bytes is negative only if both are).  alpha and beta are passed
 * on reduced by one.
 */
00439 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00440 {
00441 if((tc0[0] & tc0[1]) >= 0)
00442 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00443 if((tc0[2] & tc0[3]) >= 0)
00444 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
00445 }
/**
 * Luma filter for an edge running down a column of 16 rows.  The pixels
 * around the edge are transposed into a scratch buffer so that the row-wise
 * filter h264_loop_filter_luma_mmx2() can be reused.
 *
 * Works in two steps of 8 rows each, consuming two tc0 values per step; a
 * step is skipped when both of its tc0 values are negative.
 * NOTE(review): the transpose4x4() argument order appears to be
 * (dst, src, dst_stride, src_stride) judging from these calls; confirm
 * against its definition.
 */
00446 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00447 {
00448
00449
      /* 8x8 scratch block with a row pitch of 8 bytes */
00450 DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
00451 int i;
00452 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
00453 if((tc0[0] & tc0[1]) < 0)
00454 continue;
      /* gather columns pix-4 .. pix+3 of 8 rows into trans, as four 4x4 tiles */
00455 transpose4x4(trans, pix-4, 8, stride);
00456 transpose4x4(trans +4*8, pix, 8, stride);
00457 transpose4x4(trans+4, pix-4+4*stride, 8, stride);
00458 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
      /* the edge now lies between rows 3 and 4 of trans */
00459 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
      /* copy back rows 2..5 of trans to columns pix-2 .. pix+1 */
00460 transpose4x4(pix-2, trans +2*8, stride, 8);
00461 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
00462 }
00463 }
00464
/**
 * Filter an 8-pixel-wide chroma edge in place.  The edge lies between the
 * rows at pix-stride and pix.
 *
 * Reads the four rows pix-2*stride .. pix+stride (8 bytes each) and rewrites
 * the two rows pix-stride and pix.
 *
 * @param pix    first pixel of the row on the far side of the edge
 * @param stride distance in bytes between rows
 * @param alpha1 threshold for the difference across the edge; the callers
 *               pass alpha-1
 * @param beta1  threshold for the differences on each side; the callers
 *               pass beta-1
 * @param tc0    four clip values, each applied to two adjacent pixels
 *
 * The asm loads and stores pixel rows through register pointers that the
 * operand list does not describe, so it carries a "memory" clobber.  This
 * matters in particular when the function is inlined into a caller that
 * filters a local scratch buffer.
 */
00465 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
00466 {
00467 asm volatile(
      /* mm0..mm3 = rows pix-2*stride, pix-stride, pix, pix+stride */
00468 "movq (%0), %%mm0 \n\t"
00469 "movq (%0,%2), %%mm1 \n\t"
00470 "movq (%1), %%mm2 \n\t"
00471 "movq (%1,%2), %%mm3 \n\t"
00472 H264_DEBLOCK_MASK(%4, %5)
      /* duplicate each tc0 byte and keep it only where the mask is set */
00473 "movd %3, %%mm6 \n\t"
00474 "punpcklbw %%mm6, %%mm6 \n\t"
00475 "pand %%mm6, %%mm7 \n\t"
00476 H264_DEBLOCK_P0_Q0(%6, %7)
00477 "movq %%mm1, (%0,%2) \n\t"
00478 "movq %%mm2, (%1) \n\t"
00479
00480 :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
00481 "r"(*(uint32_t*)tc0),
00482 "m"(alpha1), "m"(beta1), "m"(mm_bone), "m"(ff_pb_3F) : "memory"
00483 );
00484 }
00485
/**
 * Chroma filter for an edge whose rows are `stride` apart: forwards to
 * h264_loop_filter_chroma_mmx2() with alpha and beta reduced by one.
 */
00486 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00487 {
00488 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00489 }
00490
/**
 * Chroma filter for an edge running down a column of 8 rows.  Columns
 * pix-2 .. pix+1 of the 8 rows are transposed into a 4-row scratch buffer
 * (row pitch 8), filtered row-wise with the edge between scratch rows 1 and
 * 2, and transposed back.
 * NOTE(review): the transpose4x4() argument order appears to be
 * (dst, src, dst_stride, src_stride) judging from these calls; confirm
 * against its definition.
 */
00491 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00492 {
00493
      /* 4 rows of 8 pixels */
00494 DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
00495 transpose4x4(trans, pix-2, 8, stride);
00496 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
00497 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
00498 transpose4x4(pix-2, trans, stride, 8);
00499 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00500 }
00501
00502
/* Per-byte smoothing used by the intra chroma filter:
 *   p0 = (p1 + ((p0 + q1) >> 1) + 1) >> 1
 * pavgb rounds up, so the low bit of (p0 ^ q1), selected with `one`, is
 * subtracted to turn the first average into a truncating one.
 * Overwrites p0; clobbers mm4; p1 and q1 are left unchanged.
 * Note: the definition ends with a line continuation, so the line following
 * the macro must stay empty. */
00503 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \
00504 "movq "#p0", %%mm4 \n\t"\
00505 "pxor "#q1", %%mm4 \n\t"\
00506 "pand "#one", %%mm4 \n\t" \
00507 "pavgb "#q1", "#p0" \n\t"\
00508 "psubusb %%mm4, "#p0" \n\t"\
00509 "pavgb "#p1", "#p0" \n\t" \
00510
/**
 * Intra variant of the chroma edge filter (no tc0 clipping) for an
 * 8-pixel-wide edge lying between the rows at pix-stride and pix.
 *
 * Reads the four rows pix-2*stride .. pix+stride and rewrites the two rows
 * pix-stride and pix.  Positions where the threshold mask is clear keep
 * their original value.
 *
 * @param pix    first pixel of the row on the far side of the edge
 * @param stride distance in bytes between rows
 * @param alpha1 threshold for the difference across the edge; the callers
 *               pass alpha-1
 * @param beta1  threshold for the differences on each side; the callers
 *               pass beta-1
 *
 * The asm loads and stores pixel rows through register pointers that the
 * operand list does not describe, so it carries a "memory" clobber.  This
 * matters in particular when the function is inlined into a caller that
 * filters a local scratch buffer.
 */
00511 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
00512 {
00513 asm volatile(
      /* mm0..mm3 = rows pix-2*stride, pix-stride, pix, pix+stride */
00514 "movq (%0), %%mm0 \n\t"
00515 "movq (%0,%2), %%mm1 \n\t"
00516 "movq (%1), %%mm2 \n\t"
00517 "movq (%1,%2), %%mm3 \n\t"
00518 H264_DEBLOCK_MASK(%3, %4)
      /* keep the unfiltered rows in mm5/mm6 */
00519 "movq %%mm1, %%mm5 \n\t"
00520 "movq %%mm2, %%mm6 \n\t"
00521 H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5)
00522 H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5)
      /* result = original + ((filtered - original) & mask) */
00523 "psubb %%mm5, %%mm1 \n\t"
00524 "psubb %%mm6, %%mm2 \n\t"
00525 "pand %%mm7, %%mm1 \n\t"
00526 "pand %%mm7, %%mm2 \n\t"
00527 "paddb %%mm5, %%mm1 \n\t"
00528 "paddb %%mm6, %%mm2 \n\t"
00529 "movq %%mm1, (%0,%2) \n\t"
00530 "movq %%mm2, (%1) \n\t"
00531 :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
00532 "m"(alpha1), "m"(beta1), "m"(mm_bone) : "memory"
00533 );
00534 }
00535
/**
 * Intra chroma filter for an edge whose rows are `stride` apart: forwards to
 * h264_loop_filter_chroma_intra_mmx2() with alpha and beta reduced by one.
 */
00536 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00537 {
00538 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
00539 }
00540
/**
 * Intra chroma filter for an edge running down a column of 8 rows.  Columns
 * pix-2 .. pix+1 of the 8 rows are transposed into a 4-row scratch buffer
 * (row pitch 8), filtered row-wise with the edge between scratch rows 1 and
 * 2, and transposed back.
 * NOTE(review): the transpose4x4() argument order appears to be
 * (dst, src, dst_stride, src_stride) judging from these calls; confirm
 * against its definition.
 */
00541 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00542 {
00543
      /* 4 rows of 8 pixels */
00544 DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
00545 transpose4x4(trans, pix-2, 8, stride);
00546 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
00547 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
00548 transpose4x4(pix-2, trans, stride, 8);
00549 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00550 }
00551
/**
 * Compute the filter strength values bS for the edges of one macroblock.
 *
 * For every processed edge, four 16-bit values are written to bS[dir][edge]:
 *   2 where either of the two adjacent blocks has a non-zero nnz entry,
 *   1 otherwise where the reference / motion-vector comparison flagged a
 *     difference,
 *   0 otherwise.
 *
 * @param bS       output, written for each (dir, edge) visited
 * @param nnz      per-block non-zero flags/counts, indexed in an 8-wide grid
 * @param ref      per-list reference indices, same indexing
 * @param mv       per-list motion vectors, same indexing
 * @param bidir    highest list index to compare (lists bidir..0 are checked)
 * @param edges    number of edges to process in the first pass (dir = 1)
 * @param step     edge increment in the first pass (dir = 1)
 * @param mask_mv0 edges with (mask_mv0 & edge) != 0 skip the ref/mv
 *                 comparison when dir = 0
 * @param mask_mv1 same for dir = 1
 * @param field    non-zero selects ff_pb_3_1/ff_pb_7_3 instead of
 *                 ff_pb_3/ff_pb_7 as motion-vector bias/limit
 *
 * Register roles kept across all asm statements: mm7 = 0, mm6 = ff_pb_1,
 * mm5 = mv bias, mm4 = mv limit, mm0 = accumulated ref/mv difference flags.
 * After the dir = 1 pass, edges and step are forced to 4 and 1 for dir = 0.
 *
 * NOTE(review): the asm statements reading nnz/ref/mv name single elements
 * as "m" operands while movd/movq read 4 or 8 bytes, and they carry no
 * "memory" clobber; confirm this is acceptable for the compilers in use.
 */
00552 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
00553 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
00554 int dir;
      /* constants: mm7 = 0, mm6 = ff_pb_1, mm5 = ff_pb_3, mm4 = ff_pb_7 */
00555 asm volatile(
00556 "pxor %%mm7, %%mm7 \n\t"
00557 "movq %0, %%mm6 \n\t"
00558 "movq %1, %%mm5 \n\t"
00559 "movq %2, %%mm4 \n\t"
00560 ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
00561 );
      /* field mode: alternate bias/limit per byte (3,1 and 7,3) */
00562 if(field)
00563 asm volatile(
00564 "movq %0, %%mm5 \n\t"
00565 "movq %1, %%mm4 \n\t"
00566 ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
00567 );
00568
00569
00570
00571 for( dir=1; dir>=0; dir-- ) {
      /* offset of the neighbouring block in the 8-wide grid: -8 or -1 */
00572 const int d_idx = dir ? -8 : -1;
00573 const int mask_mv = dir ? mask_mv1 : mask_mv0;
      /* dir=1 clears mm0 at the start of every edge; dir=0 keeps the previous value */
00574 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
00575 int b_idx, edge, l;
00576 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
00577 asm volatile(
00578 "pand %0, %%mm0 \n\t"
00579 ::"m"(mask_dir)
00580 );
00581 if(!(mask_mv & edge)) {
00582 asm volatile("pxor %%mm0, %%mm0 \n\t":);
00583 for( l = bidir; l >= 0; l-- ) {
00584 asm volatile(
      /* reference indices of the four blocks and their neighbours */
00585 "movd %0, %%mm1 \n\t"
00586 "punpckldq %1, %%mm1 \n\t"
00587 "movq %%mm1, %%mm2 \n\t"
00588 "psrlw $7, %%mm2 \n\t"
00589 "pand %%mm6, %%mm2 \n\t"
00590 "por %%mm2, %%mm1 \n\t"
00591 "punpckldq %%mm1, %%mm2 \n\t"
00592 "pcmpeqb %%mm2, %%mm1 \n\t"
00593 "paddb %%mm6, %%mm1 \n\t"
00594 "punpckhbw %%mm7, %%mm1 \n\t"
00595 "por %%mm1, %%mm0 \n\t"
00596
      /* motion vector differences: packed to signed bytes, biased by mm5 and
       * flagged where the biased value reaches the limit in mm4 */
00597 "movq %2, %%mm1 \n\t"
00598 "movq %3, %%mm2 \n\t"
00599 "psubw %4, %%mm1 \n\t"
00600 "psubw %5, %%mm2 \n\t"
00601 "packsswb %%mm2, %%mm1 \n\t"
00602 "paddb %%mm5, %%mm1 \n\t"
00603 "pminub %%mm4, %%mm1 \n\t"
00604 "pcmpeqb %%mm4, %%mm1 \n\t"
00605 "por %%mm1, %%mm0 \n\t"
00606 ::"m"(ref[l][b_idx]),
00607 "m"(ref[l][b_idx+d_idx]),
00608 "m"(mv[l][b_idx][0]),
00609 "m"(mv[l][b_idx+2][0]),
00610 "m"(mv[l][b_idx+d_idx][0]),
00611 "m"(mv[l][b_idx+d_idx+2][0])
00612 );
00613 }
00614 }
      /* mm1 = 0xFFFF per word where nnz of the block or its neighbour is non-zero */
00615 asm volatile(
00616 "movd %0, %%mm1 \n\t"
00617 "por %1, %%mm1 \n\t"
00618 "punpcklbw %%mm7, %%mm1 \n\t"
00619 "pcmpgtw %%mm7, %%mm1 \n\t"
00620 ::"m"(nnz[b_idx]),
00621 "m"(nnz[b_idx+d_idx])
00622 );
      /* reduce mm0 to 0/1 per word, then combine: nnz -> 2, else mm0 (0 or 1) */
00623 asm volatile(
00624 "pcmpeqw %%mm7, %%mm0 \n\t"
00625 "pcmpeqw %%mm7, %%mm0 \n\t"
00626 "psrlw $15, %%mm0 \n\t"
00627 "psrlw $14, %%mm1 \n\t"
00628 "movq %%mm0, %%mm2 \n\t"
00629 "por %%mm1, %%mm2 \n\t"
00630 "psrlw $1, %%mm1 \n\t"
00631 "pandn %%mm2, %%mm1 \n\t"
00632 "movq %%mm1, %0 \n\t"
00633 :"=m"(*bS[dir][edge])
00634 ::"memory"
00635 );
00636 }
00637 edges = 4;
00638 step = 1;
00639 }
      /* transpose the 4x4 words of bS[0] in place */
00640 asm volatile(
00641 "movq (%0), %%mm0 \n\t"
00642 "movq 8(%0), %%mm1 \n\t"
00643 "movq 16(%0), %%mm2 \n\t"
00644 "movq 24(%0), %%mm3 \n\t"
00645 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
00646 "movq %%mm0, (%0) \n\t"
00647 "movq %%mm3, 8(%0) \n\t"
00648 "movq %%mm4, 16(%0) \n\t"
00649 "movq %%mm2, 24(%0) \n\t"
00650 ::"r"(bS[0])
00651 :"memory"
00652 );
00653 }
00654
00655
00656
00657
/* One output row of the vertical 6-tap filter on four pixels.
 *   A..E : registers holding the five previous source rows as 16-bit words
 *   F    : register that receives the next source row, loaded from (%0)
 *   OP   : store macro, invoked as OP(%%mm6, (%1), A, d)
 * Computes ((C + D)*4 - B - E) * %4 + A + F + %5, shifts right by 5 and packs
 * to unsigned bytes; with the operands used in this file (%4 = ff_pw_5,
 * %5 = ff_pw_16) that is (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5.
 * Requirements on the enclosing asm: %0 = source pointer (advanced by %2),
 * %1 = destination pointer (advanced by %3), mm7 = 0.
 * Clobbers mm6 and A (A is also handed to OP as its temporary). */
00658 #define QPEL_H264V(A,B,C,D,E,F,OP)\
00659 "movd (%0), "#F" \n\t"\
00660 "movq "#C", %%mm6 \n\t"\
00661 "paddw "#D", %%mm6 \n\t"\
00662 "psllw $2, %%mm6 \n\t"\
00663 "psubw "#B", %%mm6 \n\t"\
00664 "psubw "#E", %%mm6 \n\t"\
00665 "pmullw %4, %%mm6 \n\t"\
00666 "add %2, %0 \n\t"\
00667 "punpcklbw %%mm7, "#F" \n\t"\
00668 "paddw %5, "#A" \n\t"\
00669 "paddw "#F", "#A" \n\t"\
00670 "paddw "#A", %%mm6 \n\t"\
00671 "psraw $5, %%mm6 \n\t"\
00672 "packuswb %%mm6, %%mm6 \n\t"\
00673 OP(%%mm6, (%1), A, d)\
00674 "add %3, %1 \n\t"
00675
/* Vertical 6-tap filter step for the two-pass (hv) case: same tap
 * computation as QPEL_H264V but without rounding, shifting or packing.
 *   A..E : registers holding the five previous source rows as 16-bit words
 *   F    : register that receives the next source row, loaded from (%0)
 *   OF   : byte offset, relative to %1, where the four 16-bit results go
 * Stores ((C + D)*4 - B - E) * %3 + A + F as four words at OF(%1).
 * Requirements on the enclosing asm: %0 = source pointer (advanced by %2),
 * %1 = intermediate buffer pointer (not advanced), %3 = multiplier
 * (ff_pw_5 in this file), mm7 = 0.
 * Clobbers mm6 and A. */
00676 #define QPEL_H264HV(A,B,C,D,E,F,OF)\
00677 "movd (%0), "#F" \n\t"\
00678 "movq "#C", %%mm6 \n\t"\
00679 "paddw "#D", %%mm6 \n\t"\
00680 "psllw $2, %%mm6 \n\t"\
00681 "psubw "#B", %%mm6 \n\t"\
00682 "psubw "#E", %%mm6 \n\t"\
00683 "pmullw %3, %%mm6 \n\t"\
00684 "add %2, %0 \n\t"\
00685 "punpcklbw %%mm7, "#F" \n\t"\
00686 "paddw "#F", "#A" \n\t"\
00687 "paddw "#A", %%mm6 \n\t"\
00688 "movq %%mm6, "#OF"(%1) \n\t"
00689
00690 #define QPEL_H264(OPNAME, OP, MMX)\
00691 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00692 int h=4;\
00693 \
00694 asm volatile(\
00695 "pxor %%mm7, %%mm7 \n\t"\
00696 "movq %5, %%mm4 \n\t"\
00697 "movq %6, %%mm5 \n\t"\
00698 "1: \n\t"\
00699 "movd -1(%0), %%mm1 \n\t"\
00700 "movd (%0), %%mm2 \n\t"\
00701 "movd 1(%0), %%mm3 \n\t"\
00702 "movd 2(%0), %%mm0 \n\t"\
00703 "punpcklbw %%mm7, %%mm1 \n\t"\
00704 "punpcklbw %%mm7, %%mm2 \n\t"\
00705 "punpcklbw %%mm7, %%mm3 \n\t"\
00706 "punpcklbw %%mm7, %%mm0 \n\t"\
00707 "paddw %%mm0, %%mm1 \n\t"\
00708 "paddw %%mm3, %%mm2 \n\t"\
00709 "movd -2(%0), %%mm0 \n\t"\
00710 "movd 3(%0), %%mm3 \n\t"\
00711 "punpcklbw %%mm7, %%mm0 \n\t"\
00712 "punpcklbw %%mm7, %%mm3 \n\t"\
00713 "paddw %%mm3, %%mm0 \n\t"\
00714 "psllw $2, %%mm2 \n\t"\
00715 "psubw %%mm1, %%mm2 \n\t"\
00716 "pmullw %%mm4, %%mm2 \n\t"\
00717 "paddw %%mm5, %%mm0 \n\t"\
00718 "paddw %%mm2, %%mm0 \n\t"\
00719 "psraw $5, %%mm0 \n\t"\
00720 "packuswb %%mm0, %%mm0 \n\t"\
00721 OP(%%mm0, (%1),%%mm6, d)\
00722 "add %3, %0 \n\t"\
00723 "add %4, %1 \n\t"\
00724 "decl %2 \n\t"\
00725 " jnz 1b \n\t"\
00726 : "+a"(src), "+c"(dst), "+m"(h)\
00727 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00728 : "memory"\
00729 );\
00730 }\
00731 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
00732 int h=4;\
00733 asm volatile(\
00734 "pxor %%mm7, %%mm7 \n\t"\
00735 "movq %0, %%mm4 \n\t"\
00736 "movq %1, %%mm5 \n\t"\
00737 :: "m"(ff_pw_5), "m"(ff_pw_16)\
00738 );\
00739 do{\
00740 asm volatile(\
00741 "movd -1(%0), %%mm1 \n\t"\
00742 "movd (%0), %%mm2 \n\t"\
00743 "movd 1(%0), %%mm3 \n\t"\
00744 "movd 2(%0), %%mm0 \n\t"\
00745 "punpcklbw %%mm7, %%mm1 \n\t"\
00746 "punpcklbw %%mm7, %%mm2 \n\t"\
00747 "punpcklbw %%mm7, %%mm3 \n\t"\
00748 "punpcklbw %%mm7, %%mm0 \n\t"\
00749 "paddw %%mm0, %%mm1 \n\t"\
00750 "paddw %%mm3, %%mm2 \n\t"\
00751 "movd -2(%0), %%mm0 \n\t"\
00752 "movd 3(%0), %%mm3 \n\t"\
00753 "punpcklbw %%mm7, %%mm0 \n\t"\
00754 "punpcklbw %%mm7, %%mm3 \n\t"\
00755 "paddw %%mm3, %%mm0 \n\t"\
00756 "psllw $2, %%mm2 \n\t"\
00757 "psubw %%mm1, %%mm2 \n\t"\
00758 "pmullw %%mm4, %%mm2 \n\t"\
00759 "paddw %%mm5, %%mm0 \n\t"\
00760 "paddw %%mm2, %%mm0 \n\t"\
00761 "movd (%2), %%mm3 \n\t"\
00762 "psraw $5, %%mm0 \n\t"\
00763 "packuswb %%mm0, %%mm0 \n\t"\
00764 PAVGB" %%mm3, %%mm0 \n\t"\
00765 OP(%%mm0, (%1),%%mm6, d)\
00766 "add %4, %0 \n\t"\
00767 "add %4, %1 \n\t"\
00768 "add %3, %2 \n\t"\
00769 : "+a"(src), "+c"(dst), "+d"(src2)\
00770 : "D"((long)src2Stride), "S"((long)dstStride)\
00771 : "memory"\
00772 );\
00773 }while(--h);\
00774 }\
00775 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00776 src -= 2*srcStride;\
00777 asm volatile(\
00778 "pxor %%mm7, %%mm7 \n\t"\
00779 "movd (%0), %%mm0 \n\t"\
00780 "add %2, %0 \n\t"\
00781 "movd (%0), %%mm1 \n\t"\
00782 "add %2, %0 \n\t"\
00783 "movd (%0), %%mm2 \n\t"\
00784 "add %2, %0 \n\t"\
00785 "movd (%0), %%mm3 \n\t"\
00786 "add %2, %0 \n\t"\
00787 "movd (%0), %%mm4 \n\t"\
00788 "add %2, %0 \n\t"\
00789 "punpcklbw %%mm7, %%mm0 \n\t"\
00790 "punpcklbw %%mm7, %%mm1 \n\t"\
00791 "punpcklbw %%mm7, %%mm2 \n\t"\
00792 "punpcklbw %%mm7, %%mm3 \n\t"\
00793 "punpcklbw %%mm7, %%mm4 \n\t"\
00794 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
00795 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
00796 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
00797 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
00798 \
00799 : "+a"(src), "+c"(dst)\
00800 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00801 : "memory"\
00802 );\
00803 }\
00804 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
00805 int h=4;\
00806 int w=3;\
00807 src -= 2*srcStride+2;\
00808 while(w--){\
00809 asm volatile(\
00810 "pxor %%mm7, %%mm7 \n\t"\
00811 "movd (%0), %%mm0 \n\t"\
00812 "add %2, %0 \n\t"\
00813 "movd (%0), %%mm1 \n\t"\
00814 "add %2, %0 \n\t"\
00815 "movd (%0), %%mm2 \n\t"\
00816 "add %2, %0 \n\t"\
00817 "movd (%0), %%mm3 \n\t"\
00818 "add %2, %0 \n\t"\
00819 "movd (%0), %%mm4 \n\t"\
00820 "add %2, %0 \n\t"\
00821 "punpcklbw %%mm7, %%mm0 \n\t"\
00822 "punpcklbw %%mm7, %%mm1 \n\t"\
00823 "punpcklbw %%mm7, %%mm2 \n\t"\
00824 "punpcklbw %%mm7, %%mm3 \n\t"\
00825 "punpcklbw %%mm7, %%mm4 \n\t"\
00826 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
00827 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
00828 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
00829 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
00830 \
00831 : "+a"(src)\
00832 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
00833 : "memory"\
00834 );\
00835 tmp += 4;\
00836 src += 4 - 9*srcStride;\
00837 }\
00838 tmp -= 3*4;\
00839 asm volatile(\
00840 "movq %4, %%mm6 \n\t"\
00841 "1: \n\t"\
00842 "movq (%0), %%mm0 \n\t"\
00843 "paddw 10(%0), %%mm0 \n\t"\
00844 "movq 2(%0), %%mm1 \n\t"\
00845 "paddw 8(%0), %%mm1 \n\t"\
00846 "movq 4(%0), %%mm2 \n\t"\
00847 "paddw 6(%0), %%mm2 \n\t"\
00848 "psubw %%mm1, %%mm0 \n\t"\
00849 "psraw $2, %%mm0 \n\t"\
00850 "psubw %%mm1, %%mm0 \n\t"\
00851 "paddsw %%mm2, %%mm0 \n\t"\
00852 "psraw $2, %%mm0 \n\t"\
00853 "paddw %%mm6, %%mm2 \n\t"\
00854 "paddw %%mm2, %%mm0 \n\t"\
00855 "psraw $6, %%mm0 \n\t"\
00856 "packuswb %%mm0, %%mm0 \n\t"\
00857 OP(%%mm0, (%1),%%mm7, d)\
00858 "add $24, %0 \n\t"\
00859 "add %3, %1 \n\t"\
00860 "decl %2 \n\t"\
00861 " jnz 1b \n\t"\
00862 : "+a"(tmp), "+c"(dst), "+m"(h)\
00863 : "S"((long)dstStride), "m"(ff_pw_32)\
00864 : "memory"\
00865 );\
00866 }\
00867 \
00868 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00869 int h=8;\
00870 asm volatile(\
00871 "pxor %%mm7, %%mm7 \n\t"\
00872 "movq %5, %%mm6 \n\t"\
00873 "1: \n\t"\
00874 "movq (%0), %%mm0 \n\t"\
00875 "movq 1(%0), %%mm2 \n\t"\
00876 "movq %%mm0, %%mm1 \n\t"\
00877 "movq %%mm2, %%mm3 \n\t"\
00878 "punpcklbw %%mm7, %%mm0 \n\t"\
00879 "punpckhbw %%mm7, %%mm1 \n\t"\
00880 "punpcklbw %%mm7, %%mm2 \n\t"\
00881 "punpckhbw %%mm7, %%mm3 \n\t"\
00882 "paddw %%mm2, %%mm0 \n\t"\
00883 "paddw %%mm3, %%mm1 \n\t"\
00884 "psllw $2, %%mm0 \n\t"\
00885 "psllw $2, %%mm1 \n\t"\
00886 "movq -1(%0), %%mm2 \n\t"\
00887 "movq 2(%0), %%mm4 \n\t"\
00888 "movq %%mm2, %%mm3 \n\t"\
00889 "movq %%mm4, %%mm5 \n\t"\
00890 "punpcklbw %%mm7, %%mm2 \n\t"\
00891 "punpckhbw %%mm7, %%mm3 \n\t"\
00892 "punpcklbw %%mm7, %%mm4 \n\t"\
00893 "punpckhbw %%mm7, %%mm5 \n\t"\
00894 "paddw %%mm4, %%mm2 \n\t"\
00895 "paddw %%mm3, %%mm5 \n\t"\
00896 "psubw %%mm2, %%mm0 \n\t"\
00897 "psubw %%mm5, %%mm1 \n\t"\
00898 "pmullw %%mm6, %%mm0 \n\t"\
00899 "pmullw %%mm6, %%mm1 \n\t"\
00900 "movd -2(%0), %%mm2 \n\t"\
00901 "movd 7(%0), %%mm5 \n\t"\
00902 "punpcklbw %%mm7, %%mm2 \n\t"\
00903 "punpcklbw %%mm7, %%mm5 \n\t"\
00904 "paddw %%mm3, %%mm2 \n\t"\
00905 "paddw %%mm5, %%mm4 \n\t"\
00906 "movq %6, %%mm5 \n\t"\
00907 "paddw %%mm5, %%mm2 \n\t"\
00908 "paddw %%mm5, %%mm4 \n\t"\
00909 "paddw %%mm2, %%mm0 \n\t"\
00910 "paddw %%mm4, %%mm1 \n\t"\
00911 "psraw $5, %%mm0 \n\t"\
00912 "psraw $5, %%mm1 \n\t"\
00913 "packuswb %%mm1, %%mm0 \n\t"\
00914 OP(%%mm0, (%1),%%mm5, q)\
00915 "add %3, %0 \n\t"\
00916 "add %4, %1 \n\t"\
00917 "decl %2 \n\t"\
00918 " jnz 1b \n\t"\
00919 : "+a"(src), "+c"(dst), "+m"(h)\
00920 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00921 : "memory"\
00922 );\
00923 }\
00924 \
00925 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
00926 int h=8;\
00927 asm volatile(\
00928 "pxor %%mm7, %%mm7 \n\t"\
00929 "movq %0, %%mm6 \n\t"\
00930 :: "m"(ff_pw_5)\
00931 );\
00932 do{\
00933 asm volatile(\
00934 "movq (%0), %%mm0 \n\t"\
00935 "movq 1(%0), %%mm2 \n\t"\
00936 "movq %%mm0, %%mm1 \n\t"\
00937 "movq %%mm2, %%mm3 \n\t"\
00938 "punpcklbw %%mm7, %%mm0 \n\t"\
00939 "punpckhbw %%mm7, %%mm1 \n\t"\
00940 "punpcklbw %%mm7, %%mm2 \n\t"\
00941 "punpckhbw %%mm7, %%mm3 \n\t"\
00942 "paddw %%mm2, %%mm0 \n\t"\
00943 "paddw %%mm3, %%mm1 \n\t"\
00944 "psllw $2, %%mm0 \n\t"\
00945 "psllw $2, %%mm1 \n\t"\
00946 "movq -1(%0), %%mm2 \n\t"\
00947 "movq 2(%0), %%mm4 \n\t"\
00948 "movq %%mm2, %%mm3 \n\t"\
00949 "movq %%mm4, %%mm5 \n\t"\
00950 "punpcklbw %%mm7, %%mm2 \n\t"\
00951 "punpckhbw %%mm7, %%mm3 \n\t"\
00952 "punpcklbw %%mm7, %%mm4 \n\t"\
00953 "punpckhbw %%mm7, %%mm5 \n\t"\
00954 "paddw %%mm4, %%mm2 \n\t"\
00955 "paddw %%mm3, %%mm5 \n\t"\
00956 "psubw %%mm2, %%mm0 \n\t"\
00957 "psubw %%mm5, %%mm1 \n\t"\
00958 "pmullw %%mm6, %%mm0 \n\t"\
00959 "pmullw %%mm6, %%mm1 \n\t"\
00960 "movd -2(%0), %%mm2 \n\t"\
00961 "movd 7(%0), %%mm5 \n\t"\
00962 "punpcklbw %%mm7, %%mm2 \n\t"\
00963 "punpcklbw %%mm7, %%mm5 \n\t"\
00964 "paddw %%mm3, %%mm2 \n\t"\
00965 "paddw %%mm5, %%mm4 \n\t"\
00966 "movq %5, %%mm5 \n\t"\
00967 "paddw %%mm5, %%mm2 \n\t"\
00968 "paddw %%mm5, %%mm4 \n\t"\
00969 "paddw %%mm2, %%mm0 \n\t"\
00970 "paddw %%mm4, %%mm1 \n\t"\
00971 "psraw $5, %%mm0 \n\t"\
00972 "psraw $5, %%mm1 \n\t"\
00973 "movq (%2), %%mm4 \n\t"\
00974 "packuswb %%mm1, %%mm0 \n\t"\
00975 PAVGB" %%mm4, %%mm0 \n\t"\
00976 OP(%%mm0, (%1),%%mm5, q)\
00977 "add %4, %0 \n\t"\
00978 "add %4, %1 \n\t"\
00979 "add %3, %2 \n\t"\
00980 : "+a"(src), "+c"(dst), "+d"(src2)\
00981 : "D"((long)src2Stride), "S"((long)dstStride),\
00982 "m"(ff_pw_16)\
00983 : "memory"\
00984 );\
00985 }while(--h);\
00986 }\
00987 \
00988 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00989 int w= 2;\
00990 src -= 2*srcStride;\
00991 \
00992 while(w--){\
00993 asm volatile(\
00994 "pxor %%mm7, %%mm7 \n\t"\
00995 "movd (%0), %%mm0 \n\t"\
00996 "add %2, %0 \n\t"\
00997 "movd (%0), %%mm1 \n\t"\
00998 "add %2, %0 \n\t"\
00999 "movd (%0), %%mm2 \n\t"\
01000 "add %2, %0 \n\t"\
01001 "movd (%0), %%mm3 \n\t"\
01002 "add %2, %0 \n\t"\
01003 "movd (%0), %%mm4 \n\t"\
01004 "add %2, %0 \n\t"\
01005 "punpcklbw %%mm7, %%mm0 \n\t"\
01006 "punpcklbw %%mm7, %%mm1 \n\t"\
01007 "punpcklbw %%mm7, %%mm2 \n\t"\
01008 "punpcklbw %%mm7, %%mm3 \n\t"\
01009 "punpcklbw %%mm7, %%mm4 \n\t"\
01010 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01011 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01012 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01013 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01014 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
01015 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
01016 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01017 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01018 \
01019 : "+a"(src), "+c"(dst)\
01020 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01021 : "memory"\
01022 );\
01023 if(h==16){\
01024 asm volatile(\
01025 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01026 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01027 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
01028 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
01029 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01030 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01031 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01032 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01033 \
01034 : "+a"(src), "+c"(dst)\
01035 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01036 : "memory"\
01037 );\
01038 }\
01039 src += 4-(h+5)*srcStride;\
01040 dst += 4-h*dstStride;\
01041 }\
01042 }\
01043 static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){ /* Two-pass filter for a size x size block (the wrappers below pass 8 or 16): pass 1 fills the 16-bit buffer tmp from src, pass 2 filters tmp horizontally into dst. tmpStride is not used; the tmp row pitch is hard-coded to 48 bytes (24 int16). */\
01044 int h = size; /* row counter for pass 2 (reloaded before each pass-2 asm block) */\
01045 int w = (size+8)>>2; /* number of 4-column strips handled in pass 1: 4 for size 8, 6 for size 16 */\
01046 src -= 2*srcStride+2; /* start two rows above and two columns left of the block */\
01047 while(w--){ /* pass 1: one 4-pixel-wide column strip per iteration */\
01048 asm volatile(\
01049 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to widen bytes to words */\
01050 "movd (%0), %%mm0 \n\t" /* load 4 pixels from each of 5 consecutive rows, stepping src (%0) by srcStride (%2) */\
01051 "add %2, %0 \n\t"\
01052 "movd (%0), %%mm1 \n\t"\
01053 "add %2, %0 \n\t"\
01054 "movd (%0), %%mm2 \n\t"\
01055 "add %2, %0 \n\t"\
01056 "movd (%0), %%mm3 \n\t"\
01057 "add %2, %0 \n\t"\
01058 "movd (%0), %%mm4 \n\t"\
01059 "add %2, %0 \n\t"\
01060 "punpcklbw %%mm7, %%mm0 \n\t" /* zero-extend the five loaded rows to 16-bit words */\
01061 "punpcklbw %%mm7, %%mm1 \n\t"\
01062 "punpcklbw %%mm7, %%mm2 \n\t"\
01063 "punpcklbw %%mm7, %%mm3 \n\t"\
01064 "punpcklbw %%mm7, %%mm4 \n\t"\
01065 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48) /* QPEL_H264HV is defined outside this chunk; presumably each invocation produces one filtered row and stores it at the given byte offset in tmp -- confirm against its definition */\
01066 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48) /* the six register arguments rotate by one position per row */\
01067 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
01068 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
01069 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
01070 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
01071 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
01072 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
01073 : "+a"(src) /* %0 = src, updated by the asm */\
01074 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5) /* %1 = tmp, %2 = srcStride, %3 = ff_pw_5 */\
01075 : "memory"\
01076 );\
01077 if(size==16){ /* rows 8..15 for 16-high blocks; this asm loads nothing itself and relies on the MMX register contents left by the asm statement above */\
01078 asm volatile(\
01079 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
01080 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
01081 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
01082 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
01083 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
01084 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
01085 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
01086 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
01087 : "+a"(src)\
01088 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
01089 : "memory"\
01090 );\
01091 }\
01092 tmp += 4; /* next strip: 4 int16 columns to the right in tmp */\
01093 src += 4 - (size+5)*srcStride; /* step back size+5 rows and 4 pixels right (5 row advances are visible above; the remaining ones presumably happen inside QPEL_H264HV) */\
01094 }\
01095 tmp -= size+8; /* back to column 0: the loop above advanced tmp by 4 per strip, size+8 in total */\
01096 w = size>>4; /* 0 for size 8, 1 for size 16, so the do-while below runs once or twice */\
01097 do{ /* pass 2: one 8-column-wide part of the output per iteration */\
01098 h = size;\
01099 asm volatile(\
01100 "movq %4, %%mm6 \n\t" /* mm6 = ff_pw_32, the rounding term added before the final shift */\
01101 "1: \n\t" /* loop: one output row of 8 pixels per iteration */\
01102 "movq (%0), %%mm0 \n\t" /* with t = int16 words at %0: mm0 = t[0..3] */\
01103 "movq 8(%0), %%mm3 \n\t" /* mm3 = t[4..7] */\
01104 "movq 2(%0), %%mm1 \n\t" /* mm1 = t[1..4] */\
01105 "movq 10(%0), %%mm4 \n\t" /* mm4 = t[5..8] */\
01106 "paddw %%mm4, %%mm0 \n\t" /* x (outputs 0..3) = t[0..3] + t[5..8], the outermost tap pair */\
01107 "paddw %%mm3, %%mm1 \n\t" /* y (outputs 0..3) = t[1..4] + t[4..7] */\
01108 "paddw 18(%0), %%mm3 \n\t" /* x (outputs 4..7) = t[4..7] + t[9..12] */\
01109 "paddw 16(%0), %%mm4 \n\t" /* y (outputs 4..7) = t[5..8] + t[8..11] */\
01110 "movq 4(%0), %%mm2 \n\t"\
01111 "movq 12(%0), %%mm5 \n\t"\
01112 "paddw 6(%0), %%mm2 \n\t" /* z (outputs 0..3) = t[2..5] + t[3..6], the centre tap pair */\
01113 "paddw 14(%0), %%mm5 \n\t" /* z (outputs 4..7) = t[6..9] + t[7..10] */\
01114 "psubw %%mm1, %%mm0 \n\t" /* from here to the psraw $6: ((((x-y)>>2) - y + z) >> 2) + z + 32) >> 6 */\
01115 "psubw %%mm4, %%mm3 \n\t" /* which matches (x - 5*y + 20*z + 512) >> 10 up to truncation in the intermediate shifts */\
01116 "psraw $2, %%mm0 \n\t"\
01117 "psraw $2, %%mm3 \n\t"\
01118 "psubw %%mm1, %%mm0 \n\t"\
01119 "psubw %%mm4, %%mm3 \n\t"\
01120 "paddsw %%mm2, %%mm0 \n\t" /* signed-saturating add of z */\
01121 "paddsw %%mm5, %%mm3 \n\t"\
01122 "psraw $2, %%mm0 \n\t"\
01123 "psraw $2, %%mm3 \n\t"\
01124 "paddw %%mm6, %%mm2 \n\t" /* z + 32 */\
01125 "paddw %%mm6, %%mm5 \n\t"\
01126 "paddw %%mm2, %%mm0 \n\t"\
01127 "paddw %%mm5, %%mm3 \n\t"\
01128 "psraw $6, %%mm0 \n\t"\
01129 "psraw $6, %%mm3 \n\t"\
01130 "packuswb %%mm3, %%mm0 \n\t" /* pack both halves into 8 bytes with unsigned saturation */\
01131 OP(%%mm0, (%1),%%mm7, q) /* OP is the macro argument that writes mm0 to the dst row at %1, with mm7 as scratch */\
01132 "add $48, %0 \n\t" /* next tmp row (48 bytes) */\
01133 "add %3, %1 \n\t" /* next dst row */\
01134 "decl %2 \n\t" /* h-- (h lives in memory) */\
01135 " jnz 1b \n\t"\
01136 : "+a"(tmp), "+c"(dst), "+m"(h) /* %0 = tmp, %1 = dst, %2 = h */\
01137 : "S"((long)dstStride), "m"(ff_pw_32) /* %3 = dstStride, %4 = ff_pw_32 */\
01138 : "memory"\
01139 );\
01140 tmp += 8 - size*24; /* rewind size rows of 24 int16 and move 8 columns right */\
01141 dst += 8 - size*dstStride; /* rewind size rows and move 8 pixels right */\
01142 }while(w--);\
01143 }\
01144 \
01145 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 8-pixel case: forwards to the shared 8-or-16 vertical routine with 8 as the final argument. */\
01146 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
01147 }\
01148 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16-pixel case: two calls to the shared 8-or-16 vertical routine, each with 16 as the final argument. */\
01149 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16); /* columns 0..7 */\
01150 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16); /* columns 8..15 */\
01151 }\
01152 \
01153 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16-wide horizontal filter assembled from four calls to the 8-wide routine (defined outside this chunk). */\
01154 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride); /* upper part, columns 0..7 */\
01155 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride); /* upper part, columns 8..15 */\
01156 src += 8*srcStride; /* move both pointers down 8 rows for the lower part */\
01157 dst += 8*dstStride;\
01158 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01159 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01160 }\
01161 \
01162 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){ /* 16-wide version of the 8-wide h_lowpass_l2 routine (defined outside this chunk), built from four calls on the four 8-column/8-row parts. */\
01163 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
01164 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01165 src += 8*dstStride; /* there is no separate src stride parameter: src is stepped with dstStride, so src and dst are assumed to share a stride */\
01166 dst += 8*dstStride;\
01167 src2 += 8*src2Stride;\
01168 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
01169 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01170 }\
01171 \
01172 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 8x8 case: forwards to the shared 8-or-16 hv routine with size = 8. */\
01173 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
01174 }\
01175 \
01176 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 16x16 case: forwards to the shared 8-or-16 hv routine with size = 16. */\
01177 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
01178 }\
01179 \
01180 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01181 { /* Four rows of four pixels: each 16-bit value from src16 is rounded ((v+16)>>5) and saturated to a byte, averaged (PAVGB) with the matching src8 pixel, and written to dst through OP. The row count is fixed at four; the h parameter is not used. */\
01182 asm volatile(\
01183 "movq %5, %%mm6 \n\t" /* mm6 = ff_pw_16 (rounding term for the >>5) */\
01184 "movq (%1), %%mm0 \n\t" /* src16 row 0 */\
01185 "movq 24(%1), %%mm1 \n\t" /* src16 row 1; rows are read 24 bytes (12 int16) apart */\
01186 "paddw %%mm6, %%mm0 \n\t"\
01187 "paddw %%mm6, %%mm1 \n\t"\
01188 "psraw $5, %%mm0 \n\t"\
01189 "psraw $5, %%mm1 \n\t"\
01190 "packuswb %%mm0, %%mm0 \n\t" /* words -> bytes with unsigned saturation */\
01191 "packuswb %%mm1, %%mm1 \n\t"\
01192 PAVGB" (%0), %%mm0 \n\t" /* average with src8 row 0 */\
01193 PAVGB" (%0,%3), %%mm1 \n\t" /* average with src8 row 1 */\
01194 OP(%%mm0, (%2), %%mm4, d) /* write 4 bytes to dst row 0 via the OP macro argument (mm4 as scratch) */\
01195 OP(%%mm1, (%2,%4), %%mm5, d) /* dst row 1 */\
01196 "lea (%0,%3,2), %0 \n\t" /* src8 += 2*src8Stride */\
01197 "lea (%2,%4,2), %2 \n\t" /* dst += 2*dstStride */\
01198 "movq 48(%1), %%mm0 \n\t" /* src16 rows 2 and 3, same processing as above */\
01199 "movq 72(%1), %%mm1 \n\t"\
01200 "paddw %%mm6, %%mm0 \n\t"\
01201 "paddw %%mm6, %%mm1 \n\t"\
01202 "psraw $5, %%mm0 \n\t"\
01203 "psraw $5, %%mm1 \n\t"\
01204 "packuswb %%mm0, %%mm0 \n\t"\
01205 "packuswb %%mm1, %%mm1 \n\t"\
01206 PAVGB" (%0), %%mm0 \n\t"\
01207 PAVGB" (%0,%3), %%mm1 \n\t"\
01208 OP(%%mm0, (%2), %%mm4, d)\
01209 OP(%%mm1, (%2,%4), %%mm5, d)\
01210 :"+a"(src8), "+c"(src16), "+d"(dst) /* %0 = src8, %1 = src16, %2 = dst */\
01211 :"S"((long)src8Stride), "D"((long)dstStride), "m"(ff_pw_16) /* %3 = src8Stride, %4 = dstStride, %5 = ff_pw_16 */\
01212 :"memory");\
01213 }\
01214 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01215 { /* h rows of eight pixels: each 16-bit value from src16 is rounded ((v+16)>>5) and saturated to a byte, averaged (PAVGB) with the matching src8 pixel, and written to dst through OP. */\
01216 asm volatile(\
01217 "movq %0, %%mm6 \n\t" /* mm6 = ff_pw_16; the loop's asm statements rely on mm6 keeping this value */\
01218 ::"m"(ff_pw_16)\
01219 );\
01220 while(h--){ /* one row per iteration */\
01221 asm volatile(\
01222 "movq (%1), %%mm0 \n\t" /* 16-bit values for pixels 0..3 */\
01223 "movq 8(%1), %%mm1 \n\t" /* 16-bit values for pixels 4..7 */\
01224 "paddw %%mm6, %%mm0 \n\t"\
01225 "paddw %%mm6, %%mm1 \n\t"\
01226 "psraw $5, %%mm0 \n\t"\
01227 "psraw $5, %%mm1 \n\t"\
01228 "packuswb %%mm1, %%mm0 \n\t" /* 8 bytes with unsigned saturation */\
01229 PAVGB" (%0), %%mm0 \n\t" /* average with the 8 src8 pixels */\
01230 OP(%%mm0, (%2), %%mm5, q) /* write 8 bytes to dst via the OP macro argument (mm5 as scratch) */\
01231 ::"a"(src8), "c"(src16), "d"(dst) /* %0 = src8, %1 = src16, %2 = dst; pointers are advanced in C below */\
01232 :"memory");\
01233 src8 += src8Stride;\
01234 src16 += 24; /* 24 int16 (48 bytes) per src16 row */\
01235 dst += dstStride;\
01236 }\
01237 }\
01238 static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01239 { /* 16-wide version: runs the 8-wide routine on columns 0..7 and then on columns 8..15. */\
01240 OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
01241 OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
01242 }\
01243
01244
/**
 * Generates the sixteen entry points OPNAME h264_qpel SIZE _mcXY_ MMX
 * (X, Y in 0..3), each taking (dst, src, stride).
 *
 * Judging by the filters each one calls, X selects the horizontal and Y the
 * vertical sub-position: mc10/mc20/mc30 use only the horizontal filter,
 * mc01/mc02/mc03 only the vertical one, and the remaining variants combine
 * intermediate results held in a local, uint64_t-typed scratch buffer.
 * Intermediate planes are always produced with the put_ variants; OPNAME is
 * applied only by the final step that writes dst.
 *
 * The lowpass and pixels helpers referenced here are generated elsewhere
 * (QPEL_H264 and code outside this chunk).
 */
01245 #define H264_MC(OPNAME, SIZE, MMX) \
01246 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
01247 OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE); /* no filtering: forwards to the plain pixels routine (always the _mmx one, whatever MMX is) */\
01248 }\
01249 /* mc10: h_lowpass_l2 with src itself as the second source */\
01250 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01251 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
01252 }\
01253 /* mc20: horizontal filter only */\
01254 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01255 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
01256 }\
01257 /* mc30: h_lowpass_l2 with src+1 as the second source */\
01258 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01259 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
01260 }\
01261 /* mc01: vertical filter into a packed SIZE-wide temp, then pixels_l2 of src and that temp */\
01262 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01263 uint64_t temp[SIZE*SIZE/8]; /* SIZE*SIZE bytes */\
01264 uint8_t * const half= (uint8_t*)temp;\
01265 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
01266 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
01267 }\
01268 /* mc02: vertical filter only */\
01269 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01270 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
01271 }\
01272 /* mc03: as mc01, but the final step uses src+stride (one row down) instead of src */\
01273 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01274 uint64_t temp[SIZE*SIZE/8];\
01275 uint8_t * const half= (uint8_t*)temp;\
01276 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
01277 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\
01278 }\
01279 /* mc11: vertical filter of src into halfV, then h_lowpass_l2 of src with halfV as the second source */\
01280 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01281 uint64_t temp[SIZE*SIZE/8];\
01282 uint8_t * const halfV= (uint8_t*)temp;\
01283 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
01284 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
01285 }\
01286 /* mc31: as mc11, but the vertical filter is taken at src+1 (one column right) */\
01287 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01288 uint64_t temp[SIZE*SIZE/8];\
01289 uint8_t * const halfV= (uint8_t*)temp;\
01290 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
01291 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
01292 }\
01293 /* mc13: as mc11, but the horizontal filter is taken at src+stride (one row down) */\
01294 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01295 uint64_t temp[SIZE*SIZE/8];\
01296 uint8_t * const halfV= (uint8_t*)temp;\
01297 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
01298 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
01299 }\
01300 /* mc33: vertical filter at src+1, horizontal filter at src+stride */\
01301 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01302 uint64_t temp[SIZE*SIZE/8];\
01303 uint8_t * const halfV= (uint8_t*)temp;\
01304 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
01305 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
01306 }\
01307 /* mc22: hv filter straight into dst; temp is only the 16-bit scratch plane (SIZE rows of 12 or 24 int16) */\
01308 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01309 uint64_t temp[SIZE*(SIZE<8?12:24)/4];\
01310 int16_t * const tmp= (int16_t*)temp;\
01311 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
01312 }\
01313 /* mc21: temp holds halfHV (first SIZE*SIZE bytes) followed by the 16-bit scratch plane; the hv result is then passed to h_lowpass_l2 together with src */\
01314 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01315 uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
01316 uint8_t * const halfHV= (uint8_t*)temp;\
01317 int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2; /* SIZE*SIZE/2 int16 == SIZE*SIZE bytes, i.e. just past halfHV */\
01318 assert(((int)temp & 7) == 0); /* 8-byte alignment check. NOTE(review): pointer-to-int cast; only the low 3 bits are examined */\
01319 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
01320 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
01321 }\
01322 /* mc23: as mc21, but the horizontal filter is taken at src+stride (one row down) */\
01323 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01324 uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
01325 uint8_t * const halfHV= (uint8_t*)temp;\
01326 int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
01327 assert(((int)temp & 7) == 0);\
01328 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
01329 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
01330 }\
01331 /* mc12: hv filter into halfHV, then pixels_l2_shift5 combines the 16-bit scratch plane (halfV), read from column 2, with halfHV */\
01332 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01333 uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
01334 int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
01335 uint8_t * const halfHV= ((uint8_t*)temp);\
01336 assert(((int)temp & 7) == 0);\
01337 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
01338 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
01339 }\
01340 /* mc32: as mc12, but the 16-bit plane is read from column 3 (halfV+3) */\
01341 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
01342 uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
01343 int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
01344 uint8_t * const halfHV= ((uint8_t*)temp);\
01345 assert(((int)temp & 7) == 0);\
01346 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
01347 OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
01348 }\
01349
01350
/*
 * Averaging store for the 3DNow! builds, emitted as asm text:
 * loads operand b into temp, averages it into a with pavgusb, then writes a
 * back to b. size is the mov suffix (the callers above pass d or q).
 */
01351 #define AVG_3DNOW_OP(a,b,temp, size) \
01352 "mov" #size " " #b ", " #temp " \n\t"\
01353 "pavgusb " #temp ", " #a " \n\t"\
01354 "mov" #size " " #a ", " #b " \n\t"
/*
 * Averaging store for the MMX2 builds, emitted as asm text:
 * loads operand b into temp, averages it into a with pavgb, then writes a
 * back to b. size is the mov suffix (the callers above pass d or q).
 */
01355 #define AVG_MMX2_OP(a,b,temp, size) \
01356 "mov" #size " " #b ", " #temp " \n\t"\
01357 "pavgb " #temp ", " #a " \n\t"\
01358 "mov" #size " " #a ", " #b " \n\t"
01359
/*
 * Instantiate the QPEL_H264 helper set four times: put_ and avg_ flavours
 * for 3DNow! and for MMX2. PAVGB is redefined around each pair so the
 * generated asm uses that instruction set's byte-average mnemonic.
 * PUT_OP is defined outside this chunk.
 */
01360 #define PAVGB "pavgusb"
01361 QPEL_H264(put_, PUT_OP, 3dnow)
01362 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
01363 #undef PAVGB
01364 #define PAVGB "pavgb"
01365 QPEL_H264(put_, PUT_OP, mmx2)
01366 QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
01367 #undef PAVGB
01368
/*
 * Instantiate the mcXY entry points for every combination of
 * {put_, avg_} x {4, 8, 16} x {3dnow, mmx2}.
 */
01369 H264_MC(put_, 4, 3dnow)
01370 H264_MC(put_, 8, 3dnow)
01371 H264_MC(put_, 16,3dnow)
01372 H264_MC(avg_, 4, 3dnow)
01373 H264_MC(avg_, 8, 3dnow)
01374 H264_MC(avg_, 16,3dnow)
01375 H264_MC(put_, 4, mmx2)
01376 H264_MC(put_, 8, mmx2)
01377 H264_MC(put_, 16,mmx2)
01378 H264_MC(avg_, 4, mmx2)
01379 H264_MC(avg_, 8, mmx2)
01380 H264_MC(avg_, 16,mmx2)
01381
01382
/*
 * First instantiation of the chroma MC template: the "put" variants.
 * H264_CHROMA_OP and H264_CHROMA_OP4 expand to nothing here, so the template
 * (dsputil_h264_template_mmx.c, not visible in this chunk) presumably emits a
 * plain store -- confirm against the template. The remaining macros supply
 * the names of the generated functions and the routine used for the 8-wide
 * MV0 case.
 */
01383 #define H264_CHROMA_OP(S,D)
01384 #define H264_CHROMA_OP4(S,D,T)
01385 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
01386 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
01387 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
01388 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
01389 #include "dsputil_h264_template_mmx.c"
01390 #undef H264_CHROMA_OP
01391 #undef H264_CHROMA_OP4
01392 #undef H264_CHROMA_MC8_TMPL
01393 #undef H264_CHROMA_MC4_TMPL
01394 #undef H264_CHROMA_MC2_TMPL
01395 #undef H264_CHROMA_MC8_MV0
01396
/*
 * Second instantiation of the chroma MC template: the MMX2 "avg" variants.
 * H264_CHROMA_OP emits a pavgb of S into D; H264_CHROMA_OP4 first loads S
 * into the scratch register T with movd and then averages T into D.
 */
01397 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
01398 #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
01399 "pavgb " #T ", " #D " \n\t"
01400 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
01401 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
01402 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
01403 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
01404 #include "dsputil_h264_template_mmx.c"
01405 #undef H264_CHROMA_OP
01406 #undef H264_CHROMA_OP4
01407 #undef H264_CHROMA_MC8_TMPL
01408 #undef H264_CHROMA_MC4_TMPL
01409 #undef H264_CHROMA_MC2_TMPL
01410 #undef H264_CHROMA_MC8_MV0
01411
/*
 * Third instantiation of the chroma MC template: the 3DNow! "avg" variants,
 * using pavgusb in place of pavgb.
 * NOTE(review): unlike the two instantiations above, H264_CHROMA_MC2_TMPL is
 * not defined here; presumably the template makes the 2-wide function
 * conditional on that macro -- confirm against dsputil_h264_template_mmx.c.
 */
01412 #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
01413 #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
01414 "pavgusb " #T ", " #D " \n\t"
01415 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
01416 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
01417 #define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
01418 #include "dsputil_h264_template_mmx.c"
01419 #undef H264_CHROMA_OP
01420 #undef H264_CHROMA_OP4
01421 #undef H264_CHROMA_MC8_TMPL
01422 #undef H264_CHROMA_MC4_TMPL
01423 #undef H264_CHROMA_MC8_MV0
01424
01425
01426
01427
/**
 * In-place weighting of a w x h block of 8-bit pixels.
 *
 * Each pixel p becomes clip_uint8((p * weight + offset') >> log2_denom),
 * where offset' = (offset << log2_denom) + ((1 << log2_denom) >> 1).
 * The arithmetic is done in 16-bit lanes: the multiply keeps the low 16 bits
 * (pmullw) and the addition saturates as signed 16-bit (paddsw).
 *
 * Rows are processed in pairs and columns in groups of four, so the loops
 * cover h rounded up to a multiple of 2 and w rounded up to a multiple of 4.
 *
 * @param dst        block to modify in place
 * @param stride     distance in bytes between consecutive rows of dst
 * @param log2_denom right-shift applied after weighting
 * @param weight     multiplier applied to every pixel
 * @param offset     additive offset, scaled by 1 << log2_denom before use
 * @param w          block width in pixels
 * @param h          block height in pixels
 */
01428 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
01429 {
01430 int x, y;
01431 offset <<= log2_denom; /* bring offset to the same scale as pixel*weight */
01432 offset += (1 << log2_denom) >> 1; /* fold in the rounding term for the final shift */
/* Set up constants; the asm in the loop below relies on mm4..mm7 keeping these values. */
01433 asm volatile(
01434 "movd %0, %%mm4 \n\t"
01435 "movd %1, %%mm5 \n\t"
01436 "movd %2, %%mm6 \n\t" /* mm6 = log2_denom, used as the psraw shift count */
01437 "pshufw $0, %%mm4, %%mm4 \n\t" /* mm4 = weight replicated into all four words */
01438 "pshufw $0, %%mm5, %%mm5 \n\t" /* mm5 = offset replicated into all four words */
01439 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 for byte<->word conversion */
01440 :: "g"(weight), "g"(offset), "g"(log2_denom)
01441 );
01442 for(y=0; y<h; y+=2){
01443 for(x=0; x<w; x+=4){
01444 asm volatile(
01445 "movd %0, %%mm0 \n\t" /* 4 pixels of row y */
01446 "movd %1, %%mm1 \n\t" /* 4 pixels of row y+1 */
01447 "punpcklbw %%mm7, %%mm0 \n\t" /* zero-extend bytes to words */
01448 "punpcklbw %%mm7, %%mm1 \n\t"
01449 "pmullw %%mm4, %%mm0 \n\t"
01450 "pmullw %%mm4, %%mm1 \n\t"
01451 "paddsw %%mm5, %%mm0 \n\t"
01452 "paddsw %%mm5, %%mm1 \n\t"
01453 "psraw %%mm6, %%mm0 \n\t"
01454 "psraw %%mm6, %%mm1 \n\t"
01455 "packuswb %%mm7, %%mm0 \n\t" /* back to bytes with unsigned saturation */
01456 "packuswb %%mm7, %%mm1 \n\t"
01457 "movd %%mm0, %0 \n\t"
01458 "movd %%mm1, %1 \n\t"
01459 : "+m"(*(uint32_t*)(dst+x)),
01460 "+m"(*(uint32_t*)(dst+x+stride))
01461 );
01462 }
01463 dst += 2*stride; /* two rows were handled per outer iteration */
01464 }
01465 }
01466
/**
 * Bi-directional weighting of a w x h block: combines dst and src into dst.
 *
 * Each output pixel is
 *   clip_uint8((d * weightd + s * weights + offset') >> (log2_denom + 1)),
 * where d and s are the dst and src pixels and
 * offset' = ((offset + 1) | 1) << log2_denom.
 * The arithmetic is done in 16-bit lanes: the multiplies keep the low 16 bits
 * (pmullw) and the additions saturate as signed 16-bit (paddsw).
 *
 * Columns are processed in groups of four, so the inner loop covers w rounded
 * up to a multiple of 4. src and dst are walked with the same stride.
 *
 * @param dst        first input and destination block
 * @param src        second input block
 * @param stride     distance in bytes between rows, used for both dst and src
 * @param log2_denom base shift; the actual shift is log2_denom + 1
 * @param weightd    multiplier for dst pixels
 * @param weights    multiplier for src pixels
 * @param offset     additive offset, transformed as described above
 * @param w          block width in pixels
 * @param h          block height in pixels
 */
01467 static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
01468 {
01469 int x, y;
01470 offset = ((offset + 1) | 1) << log2_denom; /* combined offset and rounding term for the >> (log2_denom+1) below */
/* Set up constants; the asm in the loop below relies on mm3..mm7 keeping these values. */
01471 asm volatile(
01472 "movd %0, %%mm3 \n\t"
01473 "movd %1, %%mm4 \n\t"
01474 "movd %2, %%mm5 \n\t"
01475 "movd %3, %%mm6 \n\t" /* mm6 = log2_denom+1, used as the psraw shift count */
01476 "pshufw $0, %%mm3, %%mm3 \n\t" /* mm3 = weightd replicated into all four words */
01477 "pshufw $0, %%mm4, %%mm4 \n\t" /* mm4 = weights replicated into all four words */
01478 "pshufw $0, %%mm5, %%mm5 \n\t" /* mm5 = offset replicated into all four words */
01479 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 for byte->word conversion */
01480 :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
01481 );
01482 for(y=0; y<h; y++){
01483 for(x=0; x<w; x+=4){
01484 asm volatile(
01485 "movd %0, %%mm0 \n\t" /* 4 dst pixels */
01486 "movd %1, %%mm1 \n\t" /* 4 src pixels */
01487 "punpcklbw %%mm7, %%mm0 \n\t" /* zero-extend bytes to words */
01488 "punpcklbw %%mm7, %%mm1 \n\t"
01489 "pmullw %%mm3, %%mm0 \n\t" /* d * weightd */
01490 "pmullw %%mm4, %%mm1 \n\t" /* s * weights */
01491 "paddsw %%mm1, %%mm0 \n\t"
01492 "paddsw %%mm5, %%mm0 \n\t"
01493 "psraw %%mm6, %%mm0 \n\t"
01494 "packuswb %%mm0, %%mm0 \n\t" /* back to bytes with unsigned saturation */
01495 "movd %%mm0, %0 \n\t"
01496 : "+m"(*(uint32_t*)(dst+x))
01497 : "m"(*(uint32_t*)(src+x))
01498 );
01499 }
01500 src += stride;
01501 dst += stride;
01502 }
01503 }
01504
/*
 * Generates the fixed-size entry points ff_h264_biweight_WxH_mmx2 and
 * ff_h264_weight_WxH_mmx2 for one block size. Each simply forwards to the
 * generic WxH routine above with W and H as compile-time constants.
 */
01505 #define H264_WEIGHT(W,H) \
01506 static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
01507 ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
01508 } \
01509 static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
01510 ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
01511 }
01512
/* Instantiate the weight/biweight wrappers for each block size (width, height) listed below. */
01513 H264_WEIGHT(16,16)
01514 H264_WEIGHT(16, 8)
01515 H264_WEIGHT( 8,16)
01516 H264_WEIGHT( 8, 8)
01517 H264_WEIGHT( 8, 4)
01518 H264_WEIGHT( 4, 8)
01519 H264_WEIGHT( 4, 4)
01520 H264_WEIGHT( 4, 2)
01521