00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "dsputil.h"
00026 #include "simple_idct.h"
00027 #include "mpegvideo.h"
00028 #include "x86_cpu.h"
00029 #include "mmx.h"
00030 #include "vp3dsp_mmx.h"
00031 #include "vp3dsp_sse2.h"
00032 #include "h263.h"
00033
00034
00035
00036
/* IDCT routines defined outside this file; each takes a pointer to a block of shorts. */
00037 extern void ff_idct_xvid_mmx(short *block);
00038 extern void ff_idct_xvid_mmx2(short *block);
00039
/* Global integer flag word with external linkage.
 * NOTE(review): presumably holds the detected CPU capability bits -- confirm
 * against the code that assigns it (not visible in this part of the file). */
00040 int mm_flags;
00041
00042
/* Packed constants for the SIMD code below. The 64-bit ones are 8-byte
 * aligned, the two-element arrays are 16-byte aligned.
 * NOTE(review): attribute_used is defined elsewhere; presumably it keeps the
 * constants from being discarded when they are referenced only from asm. */

/* mm_bone: every byte is 1. mm_wone: every 16-bit word is 1. mm_wtwo: every 16-bit word is 2. */
00043 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
00044 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
00045 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
00046
/* Four 32-bit lanes, each 0x80000000. */
00047 static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
00048 {0x8000000080000000ULL, 0x8000000080000000ULL};
00049
/* ff_pw_N: every 16-bit word holds the decimal value N (e.g. ff_pw_20 is 0x0014 per word). */
00050 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
00051 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
00052 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
00053 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
00054 static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
00055 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
00056 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
00057 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
00058 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
00059
/* ff_pb_XX: every byte holds the hexadecimal value XX. ff_pb_1 has the same value as mm_bone. */
00060 static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
00061 static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
00062 static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
00063 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
00064 static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
00065 static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
00066 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
00067
/* Pairs of doubles: {1.0, 1.0} and {2.0, 2.0}. */
00068 static const double ff_pd_1[2] attribute_used __attribute__ ((aligned(16))) = { 1.0, 1.0 };
00069 static const double ff_pd_2[2] attribute_used __attribute__ ((aligned(16))) = { 2.0, 2.0 };
00070
/* Emit whatever ASMALIGN(3) expands to as a stand-alone asm statement. */
00071 #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
/* Clear register regd by xor-ing it with itself. */
00072 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
00073
/* Set every 16-bit word of regd to 1: make the register all-ones, then
 * shift each word right by 15. */
00074 #define MOVQ_WONE(regd) \
00075 __asm __volatile ( \
00076 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00077 "psrlw $15, %%" #regd ::)
00078
/* Set every byte of regd to 0xFE: make the register all-ones, then add it
 * to itself byte-wise (0xFF + 0xFF wraps to 0xFE). */
00079 #define MOVQ_BFE(regd) \
00080 __asm __volatile ( \
00081 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
00082 "paddb %%" #regd ", %%" #regd " \n\t" ::)
00083
/* MOVQ_BONE sets every byte of regd to 1; MOVQ_WTWO sets every word to 2.
 * Without PIC they load the mm_bone / mm_wtwo memory constants. */
00084 #ifndef PIC
00085 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
00086 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
00087 #else
/* With PIC the same values are built in the register, with no memory operand. */
00088
00089
/* all-ones -> words of 1 -> packed to bytes of 1 */
00090 #define MOVQ_BONE(regd) \
00091 __asm __volatile ( \
00092 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00093 "psrlw $15, %%" #regd " \n\t" \
00094 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
00095
/* all-ones -> words of 1 -> shifted left by one to give words of 2 */
00096 #define MOVQ_WTWO(regd) \
00097 __asm __volatile ( \
00098 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00099 "psrlw $15, %%" #regd " \n\t" \
00100 "psllw $1, %%" #regd " \n\t"::)
00101
00102 #endif
00103
00104
00105
00106
/* Byte-wise average of rega and regb, rounded down, written to regr:
 *   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 * regb is overwritten. The caller passes in regfe the mask that is and-ed
 * in before the 64-bit shift.
 * NOTE(review): regfe is presumably 0xFE in every byte (see MOVQ_BFE), which
 * keeps the shift from moving bits across byte boundaries -- confirm at the
 * use sites. */
00107 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
00108 "movq " #rega ", " #regr " \n\t"\
00109 "pand " #regb ", " #regr " \n\t"\
00110 "pxor " #rega ", " #regb " \n\t"\
00111 "pand " #regfe "," #regb " \n\t"\
00112 "psrlq $1, " #regb " \n\t"\
00113 "paddb " #regb ", " #regr " \n\t"
00114
/* Byte-wise average of rega and regb, rounded up, written to regr:
 *   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 * regb is overwritten. */
00115 #define PAVGB_MMX(rega, regb, regr, regfe) \
00116 "movq " #rega ", " #regr " \n\t"\
00117 "por " #regb ", " #regr " \n\t"\
00118 "pxor " #rega ", " #regb " \n\t"\
00119 "pand " #regfe "," #regb " \n\t"\
00120 "psrlq $1, " #regb " \n\t"\
00121 "psubb " #regb ", " #regr " \n\t"
00122
00123
/* Two interleaved round-down averages: regr = avg(rega, regb) and
 * regp = avg(regc, regd). regb and regd are overwritten. The mask is read
 * from %%mm6, which the caller must have loaded beforehand. */
00124 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
00125 "movq " #rega ", " #regr " \n\t"\
00126 "movq " #regc ", " #regp " \n\t"\
00127 "pand " #regb ", " #regr " \n\t"\
00128 "pand " #regd ", " #regp " \n\t"\
00129 "pxor " #rega ", " #regb " \n\t"\
00130 "pxor " #regc ", " #regd " \n\t"\
00131 "pand %%mm6, " #regb " \n\t"\
00132 "pand %%mm6, " #regd " \n\t"\
00133 "psrlq $1, " #regb " \n\t"\
00134 "psrlq $1, " #regd " \n\t"\
00135 "paddb " #regb ", " #regr " \n\t"\
00136 "paddb " #regd ", " #regp " \n\t"
00137
/* Two interleaved round-up averages: regr = avg(rega, regb) and
 * regp = avg(regc, regd). regb and regd are overwritten. The mask is read
 * from %%mm6. */
00138 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
00139 "movq " #rega ", " #regr " \n\t"\
00140 "movq " #regc ", " #regp " \n\t"\
00141 "por " #regb ", " #regr " \n\t"\
00142 "por " #regd ", " #regp " \n\t"\
00143 "pxor " #rega ", " #regb " \n\t"\
00144 "pxor " #regc ", " #regd " \n\t"\
00145 "pand %%mm6, " #regb " \n\t"\
00146 "pand %%mm6, " #regd " \n\t"\
00147 "psrlq $1, " #regd " \n\t"\
00148 "psrlq $1, " #regb " \n\t"\
00149 "psubb " #regb ", " #regr " \n\t"\
00150 "psubb " #regd ", " #regp " \n\t"
00151
00152
00153
/* Template instantiation. Each included header is compiled twice with
 * different definitions of DEF / SET_RND / PAVGB / PAVGBP.
 * NOTE(review): the headers are not visible here; presumably they define the
 * functions whose names DEF() builds -- confirm in the headers. */

/* First pass over dsputil_mmx_rnd.h: names of the form x_no_rnd_y_mmx,
 * round-down averaging macros, SET_RND loads words of 1. */
00154 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
00155 #define SET_RND MOVQ_WONE
00156 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
00157 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
00158
00159 #include "dsputil_mmx_rnd.h"
00160
00161 #undef DEF
00162 #undef SET_RND
00163 #undef PAVGBP
00164 #undef PAVGB
00165
00166
00167
/* Second pass over dsputil_mmx_rnd.h: names of the form x_y_mmx,
 * round-up averaging macros, SET_RND loads words of 2. */
00168 #define DEF(x, y) x ## _ ## y ##_mmx
00169 #define SET_RND MOVQ_WTWO
00170 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
00171 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
00172
00173 #include "dsputil_mmx_rnd.h"
00174
00175 #undef DEF
00176 #undef SET_RND
00177 #undef PAVGBP
00178 #undef PAVGB
00179
00180
00181
00182
/* First pass over dsputil_mmx_avg.h: names of the form x_3dnow, with PAVGB
 * expanding to the instruction mnemonic string "pavgusb". */
00183 #define DEF(x) x ## _3dnow
00184 #define PAVGB "pavgusb"
00185
00186 #include "dsputil_mmx_avg.h"
00187
00188 #undef DEF
00189 #undef PAVGB
00190
00191
00192
00193
/* Second pass over dsputil_mmx_avg.h: names of the form x_mmx2, with PAVGB
 * expanding to the instruction mnemonic string "pavgb". */
00194 #define DEF(x) x ## _mmx2
00195
00196
00197 #define PAVGB "pavgb"
00198
00199 #include "dsputil_mmx_avg.h"
00200
00201 #undef DEF
00202 #undef PAVGB
00203
/* Interleave step emitted as asm text. Copies a to t with "mov<m>", then
 * unpacks the low halves of (a, b) into a and the high halves of (old a, b)
 * into t, using punpckl<n> / punpckh<n>. b is left unchanged. */
00204 #define SBUTTERFLY(a,b,t,n,m)\
00205 "mov" #m " " #a ", " #t " \n\t" \
00206 "punpckl" #n " " #b ", " #a " \n\t" \
00207 "punpckh" #n " " #b ", " #t " \n\t" \
00208
/* Four SBUTTERFLY steps (two at word->dword granularity, two at
 * dword->qword) over registers a, b, c, d with scratch register t.
 * NOTE(review): the output rows end up spread over a, d, t and c rather than
 * a, b, c, d -- confirm the order against a call site before relying on it. */
00209 #define TRANSPOSE4(a,b,c,d,t)\
00210 SBUTTERFLY(a,b,t,wd,q) \
00211 SBUTTERFLY(c,d,b,wd,q) \
00212 SBUTTERFLY(a,c,d,dq,q) \
00213 SBUTTERFLY(t,b,c,dq,q)
00214
00215
00216
00217
00218 #ifdef CONFIG_ENCODERS
00219 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
00220 {
00221 asm volatile(
00222 "mov $-128, %%"REG_a" \n\t"
00223 "pxor %%mm7, %%mm7 \n\t"
00224 ASMALIGN(4)
00225 "1: \n\t"
00226 "movq (%0), %%mm0 \n\t"
00227 "movq (%0, %2), %%mm2 \n\t"
00228 "movq %%mm0, %%mm1 \n\t"
00229 "movq %%mm2, %%mm3 \n\t"
00230 "punpcklbw %%mm7, %%mm0 \n\t"
00231 "punpckhbw %%mm7, %%mm1 \n\t"
00232 "punpcklbw %%mm7, %%mm2 \n\t"
00233 "punpckhbw %%mm7, %%mm3 \n\t"
00234 "movq %%mm0, (%1, %%"REG_a") \n\t"
00235 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
00236 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
00237 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
00238 "add %3, %0 \n\t"
00239 "add $32, %%"REG_a" \n\t"
00240 "js 1b \n\t"
00241 : "+r" (pixels)
00242 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
00243 : "%"REG_a
00244 );
00245 }
00246
00247 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
00248 {
00249 asm volatile(
00250 "pxor %%mm7, %%mm7 \n\t"
00251 "mov $-128, %%"REG_a" \n\t"
00252 ASMALIGN(4)
00253 "1: \n\t"
00254 "movq (%0), %%mm0 \n\t"
00255 "movq (%1), %%mm2 \n\t"
00256 "movq %%mm0, %%mm1 \n\t"
00257 "movq %%mm2, %%mm3 \n\t"
00258 "punpcklbw %%mm7, %%mm0 \n\t"
00259 "punpckhbw %%mm7, %%mm1 \n\t"
00260 "punpcklbw %%mm7, %%mm2 \n\t"
00261 "punpckhbw %%mm7, %%mm3 \n\t"
00262 "psubw %%mm2, %%mm0 \n\t"
00263 "psubw %%mm3, %%mm1 \n\t"
00264 "movq %%mm0, (%2, %%"REG_a") \n\t"
00265 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
00266 "add %3, %0 \n\t"
00267 "add %3, %1 \n\t"
00268 "add $16, %%"REG_a" \n\t"
00269 "jnz 1b \n\t"
00270 : "+r" (s1), "+r" (s2)
00271 : "r" (block+64), "r" ((long)stride)
00272 : "%"REG_a
00273 );
00274 }
00275 #endif //CONFIG_ENCODERS
00276
/**
 * Convert an 8x8 block of 16-bit values to unsigned 8-bit pixels and store
 * them. packuswb saturates each signed word to the range 0..255.
 *
 * The work is split into two asm statements of four rows each. Each one reads
 * 64 bytes of input and writes 8 bytes to each of four rows.
 *
 * @param block     64 input values
 * @param pixels    top-left destination pixel
 * @param line_size distance in bytes between two consecutive destination rows
 */
00277 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00278 {
00279 const DCTELEM *p;
00280 uint8_t *pix;
00281
00282
00283 p = block;
00284 pix = pixels;
00285
/* Rows 0..3. The input is addressed through the memory operand %3 (*p) with
 * a numeric displacement written in front of it. */
00286 __asm __volatile(
00287 "movq %3, %%mm0 \n\t"
00288 "movq 8%3, %%mm1 \n\t"
00289 "movq 16%3, %%mm2 \n\t"
00290 "movq 24%3, %%mm3 \n\t"
00291 "movq 32%3, %%mm4 \n\t"
00292 "movq 40%3, %%mm5 \n\t"
00293 "movq 48%3, %%mm6 \n\t"
00294 "movq 56%3, %%mm7 \n\t"
/* pack two registers of four words each into one register of eight bytes */
00295 "packuswb %%mm1, %%mm0 \n\t"
00296 "packuswb %%mm3, %%mm2 \n\t"
00297 "packuswb %%mm5, %%mm4 \n\t"
00298 "packuswb %%mm7, %%mm6 \n\t"
/* %1 = line_size, %2 = 3*line_size */
00299 "movq %%mm0, (%0) \n\t"
00300 "movq %%mm2, (%0, %1) \n\t"
00301 "movq %%mm4, (%0, %1, 2) \n\t"
00302 "movq %%mm6, (%0, %2) \n\t"
00303 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
00304 :"memory");
/* advance to row 4 of the output and to value 32 of the input */
00305 pix += line_size*4;
00306 p += 32;
00307
00308
00309
00310
/* Rows 4..7. Same sequence, but the input pointer is passed in a register. */
00311 __asm __volatile(
00312 "movq (%3), %%mm0 \n\t"
00313 "movq 8(%3), %%mm1 \n\t"
00314 "movq 16(%3), %%mm2 \n\t"
00315 "movq 24(%3), %%mm3 \n\t"
00316 "movq 32(%3), %%mm4 \n\t"
00317 "movq 40(%3), %%mm5 \n\t"
00318 "movq 48(%3), %%mm6 \n\t"
00319 "movq 56(%3), %%mm7 \n\t"
00320 "packuswb %%mm1, %%mm0 \n\t"
00321 "packuswb %%mm3, %%mm2 \n\t"
00322 "packuswb %%mm5, %%mm4 \n\t"
00323 "packuswb %%mm7, %%mm6 \n\t"
00324 "movq %%mm0, (%0) \n\t"
00325 "movq %%mm2, (%0, %1) \n\t"
00326 "movq %%mm4, (%0, %1, 2) \n\t"
00327 "movq %%mm6, (%0, %2) \n\t"
00328 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
00329 :"memory");
00330 }
00331
/* Eight bytes of 0x80 (128), declared through DECLARE_ALIGNED_8. Loaded by
 * put_signed_pixels_clamped_mmx as the per-byte bias it adds to each row. */
00332 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
00333 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
00334
00335 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00336 {
00337 int i;
00338
00339 movq_m2r(*vector128, mm1);
00340 for (i = 0; i < 8; i++) {
00341 movq_m2r(*(block), mm0);
00342 packsswb_m2r(*(block + 4), mm0);
00343 block += 8;
00344 paddb_r2r(mm1, mm0);
00345 movq_r2m(mm0, *pixels);
00346 pixels += line_size;
00347 }
00348 }
00349
/**
 * Add an 8x8 block of signed 16-bit values to the existing 8-bit pixels and
 * write the result back, saturated to 0..255.
 *
 * Each of the four loop iterations handles two rows: the pixels are widened
 * to words, the values are added with signed saturation (paddsw), and the
 * sums are packed back to bytes with unsigned saturation (packuswb).
 *
 * @param block     64 values to add, 8 per row
 * @param pixels    top-left pixel of the block that is updated in place
 * @param line_size distance in bytes between two consecutive pixel rows
 */
00350 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00351 {
00352 const DCTELEM *p;
00353 uint8_t *pix;
00354 int i;
00355
00356
00357 p = block;
00358 pix = pixels;
/* mm7 = 0, used as the high part when widening bytes to words */
00359 MOVQ_ZERO(mm7);
00360 i = 4;
00361 do {
00362 __asm __volatile(
/* mm0..mm3 = 16 values: two rows of 8 */
00363 "movq (%2), %%mm0 \n\t"
00364 "movq 8(%2), %%mm1 \n\t"
00365 "movq 16(%2), %%mm2 \n\t"
00366 "movq 24(%2), %%mm3 \n\t"
/* mm4 = first pixel row, mm6 = second pixel row */
00367 "movq %0, %%mm4 \n\t"
00368 "movq %1, %%mm6 \n\t"
00369 "movq %%mm4, %%mm5 \n\t"
00370 "punpcklbw %%mm7, %%mm4 \n\t"
00371 "punpckhbw %%mm7, %%mm5 \n\t"
00372 "paddsw %%mm4, %%mm0 \n\t"
00373 "paddsw %%mm5, %%mm1 \n\t"
00374 "movq %%mm6, %%mm5 \n\t"
00375 "punpcklbw %%mm7, %%mm6 \n\t"
00376 "punpckhbw %%mm7, %%mm5 \n\t"
00377 "paddsw %%mm6, %%mm2 \n\t"
00378 "paddsw %%mm5, %%mm3 \n\t"
00379 "packuswb %%mm1, %%mm0 \n\t"
00380 "packuswb %%mm3, %%mm2 \n\t"
00381 "movq %%mm0, %0 \n\t"
00382 "movq %%mm2, %1 \n\t"
/* The "+m" operands have type uint8_t, i.e. they name one byte each, while
 * movq accesses 8 bytes; the "memory" clobber covers the rest of the row. */
00383 :"+m"(*pix), "+m"(*(pix+line_size))
00384 :"r"(p)
00385 :"memory");
00386 pix += line_size*2;
00387 p += 16;
00388 } while (--i);
00389 }
00390
/**
 * Copy a block that is 4 bytes wide and h rows high from pixels to block.
 *
 * Both buffers use the same row distance. Four rows are copied per loop
 * iteration and the loop ends when h reaches exactly 0 (subl $4 / jnz), so h
 * must be a positive multiple of 4.
 *
 * @param block     destination, top-left byte
 * @param pixels    source, top-left byte
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to copy
 */
00391 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00392 {
00393 __asm __volatile(
/* REG_a = 2*line_size, the step after each pair of rows */
00394 "lea (%3, %3), %%"REG_a" \n\t"
00395 ASMALIGN(3)
00396 "1: \n\t"
00397 "movd (%1), %%mm0 \n\t"
00398 "movd (%1, %3), %%mm1 \n\t"
00399 "movd %%mm0, (%2) \n\t"
00400 "movd %%mm1, (%2, %3) \n\t"
00401 "add %%"REG_a", %1 \n\t"
00402 "add %%"REG_a", %2 \n\t"
00403 "movd (%1), %%mm0 \n\t"
00404 "movd (%1, %3), %%mm1 \n\t"
00405 "movd %%mm0, (%2) \n\t"
00406 "movd %%mm1, (%2, %3) \n\t"
00407 "add %%"REG_a", %1 \n\t"
00408 "add %%"REG_a", %2 \n\t"
00409 "subl $4, %0 \n\t"
00410 "jnz 1b \n\t"
00411 : "+g"(h), "+r" (pixels), "+r" (block)
00412 : "r"((long)line_size)
00413 : "%"REG_a, "memory"
00414 );
00415 }
00416
/**
 * Copy a block that is 8 bytes wide and h rows high from pixels to block.
 *
 * Both buffers use the same row distance. Four rows are copied per loop
 * iteration and the loop ends when h reaches exactly 0 (subl $4 / jnz), so h
 * must be a positive multiple of 4.
 *
 * @param block     destination, top-left byte
 * @param pixels    source, top-left byte
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to copy
 */
00417 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00418 {
00419 __asm __volatile(
/* REG_a = 2*line_size, the step after each pair of rows */
00420 "lea (%3, %3), %%"REG_a" \n\t"
00421 ASMALIGN(3)
00422 "1: \n\t"
00423 "movq (%1), %%mm0 \n\t"
00424 "movq (%1, %3), %%mm1 \n\t"
00425 "movq %%mm0, (%2) \n\t"
00426 "movq %%mm1, (%2, %3) \n\t"
00427 "add %%"REG_a", %1 \n\t"
00428 "add %%"REG_a", %2 \n\t"
00429 "movq (%1), %%mm0 \n\t"
00430 "movq (%1, %3), %%mm1 \n\t"
00431 "movq %%mm0, (%2) \n\t"
00432 "movq %%mm1, (%2, %3) \n\t"
00433 "add %%"REG_a", %1 \n\t"
00434 "add %%"REG_a", %2 \n\t"
00435 "subl $4, %0 \n\t"
00436 "jnz 1b \n\t"
00437 : "+g"(h), "+r" (pixels), "+r" (block)
00438 : "r"((long)line_size)
00439 : "%"REG_a, "memory"
00440 );
00441 }
00442
/**
 * Copy a block that is 16 bytes wide and h rows high from pixels to block.
 *
 * Each row is moved as two 8-byte halves. Both buffers use the same row
 * distance. Four rows are copied per loop iteration and the loop ends when h
 * reaches exactly 0 (subl $4 / jnz), so h must be a positive multiple of 4.
 *
 * @param block     destination, top-left byte
 * @param pixels    source, top-left byte
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to copy
 */
00443 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00444 {
00445 __asm __volatile(
/* REG_a = 2*line_size, the step after each pair of rows */
00446 "lea (%3, %3), %%"REG_a" \n\t"
00447 ASMALIGN(3)
00448 "1: \n\t"
00449 "movq (%1), %%mm0 \n\t"
00450 "movq 8(%1), %%mm4 \n\t"
00451 "movq (%1, %3), %%mm1 \n\t"
00452 "movq 8(%1, %3), %%mm5 \n\t"
00453 "movq %%mm0, (%2) \n\t"
00454 "movq %%mm4, 8(%2) \n\t"
00455 "movq %%mm1, (%2, %3) \n\t"
00456 "movq %%mm5, 8(%2, %3) \n\t"
00457 "add %%"REG_a", %1 \n\t"
00458 "add %%"REG_a", %2 \n\t"
00459 "movq (%1), %%mm0 \n\t"
00460 "movq 8(%1), %%mm4 \n\t"
00461 "movq (%1, %3), %%mm1 \n\t"
00462 "movq 8(%1, %3), %%mm5 \n\t"
00463 "movq %%mm0, (%2) \n\t"
00464 "movq %%mm4, 8(%2) \n\t"
00465 "movq %%mm1, (%2, %3) \n\t"
00466 "movq %%mm5, 8(%2, %3) \n\t"
00467 "add %%"REG_a", %1 \n\t"
00468 "add %%"REG_a", %2 \n\t"
00469 "subl $4, %0 \n\t"
00470 "jnz 1b \n\t"
00471 : "+g"(h), "+r" (pixels), "+r" (block)
00472 : "r"((long)line_size)
00473 : "%"REG_a, "memory"
00474 );
00475 }
00476
00477 static void clear_blocks_mmx(DCTELEM *blocks)
00478 {
00479 __asm __volatile(
00480 "pxor %%mm7, %%mm7 \n\t"
00481 "mov $-128*6, %%"REG_a" \n\t"
00482 "1: \n\t"
00483 "movq %%mm7, (%0, %%"REG_a") \n\t"
00484 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
00485 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
00486 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
00487 "add $32, %%"REG_a" \n\t"
00488 " js 1b \n\t"
00489 : : "r" (((uint8_t *)blocks)+128*6)
00490 : "%"REG_a
00491 );
00492 }
00493
00494 #ifdef CONFIG_ENCODERS
/**
 * Sum all pixels of a 16x16 block.
 *
 * The rows are addressed as (pix - index) + index, where index starts at
 * -line_size*16 and is increased by line_size per row until it is no longer
 * negative. Four 16-bit partial sums are kept in mm6 and added together at
 * the end; the result is masked to 16 bits.
 *
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between consecutive rows
 * @return the pixel sum (0..0xFFFF)
 */
00495 static int pix_sum16_mmx(uint8_t * pix, int line_size){
00496 const int h=16;
00497 int sum;
00498 long index= -line_size*h;
00499
00500 __asm __volatile(
/* mm7 = 0 for widening, mm6 = accumulator */
00501 "pxor %%mm7, %%mm7 \n\t"
00502 "pxor %%mm6, %%mm6 \n\t"
00503 "1: \n\t"
00504 "movq (%2, %1), %%mm0 \n\t"
00505 "movq (%2, %1), %%mm1 \n\t"
00506 "movq 8(%2, %1), %%mm2 \n\t"
00507 "movq 8(%2, %1), %%mm3 \n\t"
00508 "punpcklbw %%mm7, %%mm0 \n\t"
00509 "punpckhbw %%mm7, %%mm1 \n\t"
00510 "punpcklbw %%mm7, %%mm2 \n\t"
00511 "punpckhbw %%mm7, %%mm3 \n\t"
/* fold the 16 pixels of the row into four word lanes and accumulate */
00512 "paddw %%mm0, %%mm1 \n\t"
00513 "paddw %%mm2, %%mm3 \n\t"
00514 "paddw %%mm1, %%mm3 \n\t"
00515 "paddw %%mm3, %%mm6 \n\t"
00516 "add %3, %1 \n\t"
00517 " js 1b \n\t"
/* horizontal add of the four word lanes into the lowest word */
00518 "movq %%mm6, %%mm5 \n\t"
00519 "psrlq $32, %%mm6 \n\t"
00520 "paddw %%mm5, %%mm6 \n\t"
00521 "movq %%mm6, %%mm5 \n\t"
00522 "psrlq $16, %%mm6 \n\t"
00523 "paddw %%mm5, %%mm6 \n\t"
00524 "movd %%mm6, %0 \n\t"
/* keep only the lowest word; the upper 16 bits hold a partial sum */
00525 "andl $0xFFFF, %0 \n\t"
00526 : "=&r" (sum), "+r" (index)
00527 : "r" (pix - index), "r" ((long)line_size)
00528 );
00529
00530 return sum;
00531 }
00532 #endif //CONFIG_ENCODERS
00533
/**
 * Add src to dst byte by byte, modulo 256: dst[i] += src[i] for 0 <= i < w.
 *
 * The MMX loop handles 16 bytes per iteration for as long as a full 16-byte
 * chunk fits (i < w-15). The remaining bytes are handled by the scalar loop.
 *
 * The bound is tested before the first iteration and with a signed compare.
 * The previous version entered the loop body unconditionally and used an
 * unsigned compare, so it read and wrote 16 bytes even when w < 16 and, for
 * w < 15, never terminated because the negative bound compared as a huge
 * unsigned value.
 *
 * The function does not execute emms.
 *
 * @param dst buffer that is updated in place, at least w bytes
 * @param src buffer that is added, at least w bytes
 * @param w   number of bytes to process; values <= 0 do nothing
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    __asm __volatile(
        "jmp 2f                         \n\t" /* test the bound before the first chunk */
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1,  (%2, %0)          \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "2:                             \n\t"
        "cmp %3, %0                     \n\t"
        " jl 1b                         \n\t" /* signed: continue while i < w-15 */
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
        : "memory"                            /* dst is written through a register operand */
    );
    /* tail: fewer than 16 bytes left */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
00555
/* Asm text shared by h263_v_loop_filter_mmx and h263_h_loop_filter_mmx.
 *
 * Operands expected by the text:
 *   %0, %1, %2, %3  four 8-byte memory rows, read here
 *   %4              32-bit integer, loaded with movd and replicated to all
 *                   eight bytes (both users pass 2*strength)
 *   %5              8-byte mask and-ed in before a shift by 2 (both users
 *                   pass ff_pb_FC)
 *
 * The macro itself stores nothing. On exit the filtered rows are left in
 * registers: mm5 for %0, mm3 for %1, mm4 for %2 and mm6 for %3, which is the
 * mapping h263_v_loop_filter_mmx uses when it stores them. All of mm0..mm7
 * are overwritten.
 *
 * NOTE(review): the arithmetic starts from ((%0 - %3) + 4*(%2 - %1)), takes
 * its magnitude divided by 8 and limits it using %4; the exact correspondence
 * to the H.263 deblocking formula should be checked against the C reference
 * implementation, which is not visible here. */
00556 #define H263_LOOP_FILTER \
00557 "pxor %%mm7, %%mm7 \n\t"\
00558 "movq %0, %%mm0 \n\t"\
00559 "movq %0, %%mm1 \n\t"\
00560 "movq %3, %%mm2 \n\t"\
00561 "movq %3, %%mm3 \n\t"\
00562 "punpcklbw %%mm7, %%mm0 \n\t"\
00563 "punpckhbw %%mm7, %%mm1 \n\t"\
00564 "punpcklbw %%mm7, %%mm2 \n\t"\
00565 "punpckhbw %%mm7, %%mm3 \n\t"\
00566 "psubw %%mm2, %%mm0 \n\t"\
00567 "psubw %%mm3, %%mm1 \n\t"\
00568 "movq %1, %%mm2 \n\t"\
00569 "movq %1, %%mm3 \n\t"\
00570 "movq %2, %%mm4 \n\t"\
00571 "movq %2, %%mm5 \n\t"\
00572 "punpcklbw %%mm7, %%mm2 \n\t"\
00573 "punpckhbw %%mm7, %%mm3 \n\t"\
00574 "punpcklbw %%mm7, %%mm4 \n\t"\
00575 "punpckhbw %%mm7, %%mm5 \n\t"\
00576 "psubw %%mm2, %%mm4 \n\t"\
00577 "psubw %%mm3, %%mm5 \n\t"\
00578 "psllw $2, %%mm4 \n\t"\
00579 "psllw $2, %%mm5 \n\t"\
00580 "paddw %%mm0, %%mm4 \n\t"\
00581 "paddw %%mm1, %%mm5 \n\t"\
00582 "pxor %%mm6, %%mm6 \n\t"\
00583 "pcmpgtw %%mm4, %%mm6 \n\t"\
00584 "pcmpgtw %%mm5, %%mm7 \n\t"\
00585 "pxor %%mm6, %%mm4 \n\t"\
00586 "pxor %%mm7, %%mm5 \n\t"\
00587 "psubw %%mm6, %%mm4 \n\t"\
00588 "psubw %%mm7, %%mm5 \n\t"\
00589 "psrlw $3, %%mm4 \n\t"\
00590 "psrlw $3, %%mm5 \n\t"\
00591 "packuswb %%mm5, %%mm4 \n\t"\
00592 "packsswb %%mm7, %%mm6 \n\t"\
00593 "pxor %%mm7, %%mm7 \n\t"\
00594 "movd %4, %%mm2 \n\t"\
00595 "punpcklbw %%mm2, %%mm2 \n\t"\
00596 "punpcklbw %%mm2, %%mm2 \n\t"\
00597 "punpcklbw %%mm2, %%mm2 \n\t"\
00598 "psubusb %%mm4, %%mm2 \n\t"\
00599 "movq %%mm2, %%mm3 \n\t"\
00600 "psubusb %%mm4, %%mm3 \n\t"\
00601 "psubb %%mm3, %%mm2 \n\t"\
00602 "movq %1, %%mm3 \n\t"\
00603 "movq %2, %%mm4 \n\t"\
00604 "pxor %%mm6, %%mm3 \n\t"\
00605 "pxor %%mm6, %%mm4 \n\t"\
00606 "paddusb %%mm2, %%mm3 \n\t"\
00607 "psubusb %%mm2, %%mm4 \n\t"\
00608 "pxor %%mm6, %%mm3 \n\t"\
00609 "pxor %%mm6, %%mm4 \n\t"\
00610 "paddusb %%mm2, %%mm2 \n\t"\
00611 "packsswb %%mm1, %%mm0 \n\t"\
00612 "pcmpgtb %%mm0, %%mm7 \n\t"\
00613 "pxor %%mm7, %%mm0 \n\t"\
00614 "psubb %%mm7, %%mm0 \n\t"\
00615 "movq %%mm0, %%mm1 \n\t"\
00616 "psubusb %%mm2, %%mm0 \n\t"\
00617 "psubb %%mm0, %%mm1 \n\t"\
00618 "pand %5, %%mm1 \n\t"\
00619 "psrlw $2, %%mm1 \n\t"\
00620 "pxor %%mm7, %%mm1 \n\t"\
00621 "psubb %%mm7, %%mm1 \n\t"\
00622 "movq %0, %%mm5 \n\t"\
00623 "movq %3, %%mm6 \n\t"\
00624 "psubb %%mm1, %%mm5 \n\t"\
00625 "paddb %%mm1, %%mm6 \n\t"
00626
/**
 * Apply H263_LOOP_FILTER to the four 8-byte rows at src-2*stride,
 * src-stride, src and src+stride, and write the filtered rows back in place.
 *
 * The body is only active when ENABLE_ANY_H263 is non-zero.
 *
 * @param src    first byte of the row directly below the filtered edge
 * @param stride distance in bytes between consecutive rows
 * @param qscale index into ff_h263_loop_filter_strength; twice the table
 *               value is passed to the filter
 */
00627 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00628 if(ENABLE_ANY_H263) {
00629 const int strength= ff_h263_loop_filter_strength[qscale];
00630
00631 asm volatile(
00632
00633 H263_LOOP_FILTER
00634
/* store the results the macro leaves in mm3, mm4, mm5 and mm6 */
00635 "movq %%mm3, %1 \n\t"
00636 "movq %%mm4, %2 \n\t"
00637 "movq %%mm5, %0 \n\t"
00638 "movq %%mm6, %3 \n\t"
00639 : "+m" (*(uint64_t*)(src - 2*stride)),
00640 "+m" (*(uint64_t*)(src - 1*stride)),
00641 "+m" (*(uint64_t*)(src + 0*stride)),
00642 "+m" (*(uint64_t*)(src + 1*stride))
00643 : "g" (2*strength), "m"(ff_pb_FC)
00644 );
00645 }
00646 }
00647
/**
 * Transpose a 4x4 block of bytes.
 *
 * Four bytes are read from each of four source rows (as 32-bit loads in C),
 * interleaved with punpck instructions, and four bytes are written to each
 * of four destination rows, so that destination row i holds byte i of source
 * rows 0..3.
 *
 * @param dst        top-left byte of the destination
 * @param src        top-left byte of the source
 * @param dst_stride distance in bytes between consecutive destination rows
 * @param src_stride distance in bytes between consecutive source rows
 */
00648 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
/* the four source rows, read through uint32_t casts of the byte pointer */
00649 uint32_t s0 = *(uint32_t*)(src + 0*src_stride);
00650 uint32_t s1 = *(uint32_t*)(src + 1*src_stride);
00651 uint32_t s2 = *(uint32_t*)(src + 2*src_stride);
00652 uint32_t s3 = *(uint32_t*)(src + 3*src_stride);
00653
00654 asm volatile(
00655 "movd %4, %%mm0 \n\t"
00656 "movd %5, %%mm1 \n\t"
00657 "movd %6, %%mm2 \n\t"
00658 "movd %7, %%mm3 \n\t"
/* interleave bytes of rows 0/1 and rows 2/3, then interleave the words */
00659 "punpcklbw %%mm1, %%mm0 \n\t"
00660 "punpcklbw %%mm3, %%mm2 \n\t"
00661 "movq %%mm0, %%mm1 \n\t"
00662 "punpcklwd %%mm2, %%mm0 \n\t"
00663 "punpckhwd %%mm2, %%mm1 \n\t"
/* mm0 holds output rows 0 and 1, mm1 holds output rows 2 and 3 */
00664 "movd %%mm0, %0 \n\t"
00665 "punpckhdq %%mm0, %%mm0 \n\t"
00666 "movd %%mm0, %1 \n\t"
00667 "movd %%mm1, %2 \n\t"
00668 "punpckhdq %%mm1, %%mm1 \n\t"
00669 "movd %%mm1, %3 \n\t"
00670
00671 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
00672 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
00673 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
00674 "=m" (*(uint32_t*)(dst + 3*dst_stride))
00675 : "m" (s0),
00676 "m" (s1),
00677 "m" (s2),
00678 "m" (s3)
00679 );
00680 }
00681
00682 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00683 if(ENABLE_ANY_H263) {
00684 const int strength= ff_h263_loop_filter_strength[qscale];
00685 uint64_t temp[4] __attribute__ ((aligned(8)));
00686 uint8_t *btemp= (uint8_t*)temp;
00687
00688 src -= 2;
00689
00690 transpose4x4(btemp , src , 8, stride);
00691 transpose4x4(btemp+4, src + 4*stride, 8, stride);
00692 asm volatile(
00693 H263_LOOP_FILTER
00694
00695 : "+m" (temp[0]),
00696 "+m" (temp[1]),
00697 "+m" (temp[2]),
00698 "+m" (temp[3])
00699 : "g" (2*strength), "m"(ff_pb_FC)
00700 );
00701
00702 asm volatile(
00703 "movq %%mm5, %%mm1 \n\t"
00704 "movq %%mm4, %%mm0 \n\t"
00705 "punpcklbw %%mm3, %%mm5 \n\t"
00706 "punpcklbw %%mm6, %%mm4 \n\t"
00707 "punpckhbw %%mm3, %%mm1 \n\t"
00708 "punpckhbw %%mm6, %%mm0 \n\t"
00709 "movq %%mm5, %%mm3 \n\t"
00710 "movq %%mm1, %%mm6 \n\t"
00711 "punpcklwd %%mm4, %%mm5 \n\t"
00712 "punpcklwd %%mm0, %%mm1 \n\t"
00713 "punpckhwd %%mm4, %%mm3 \n\t"
00714 "punpckhwd %%mm0, %%mm6 \n\t"
00715 "movd %%mm5, (%0) \n\t"
00716 "punpckhdq %%mm5, %%mm5 \n\t"
00717 "movd %%mm5, (%0,%2) \n\t"
00718 "movd %%mm3, (%0,%2,2) \n\t"
00719 "punpckhdq %%mm3, %%mm3 \n\t"
00720 "movd %%mm3, (%0,%3) \n\t"
00721 "movd %%mm1, (%1) \n\t"
00722 "punpckhdq %%mm1, %%mm1 \n\t"
00723 "movd %%mm1, (%1,%2) \n\t"
00724 "movd %%mm6, (%1,%2,2) \n\t"
00725 "punpckhdq %%mm6, %%mm6 \n\t"
00726 "movd %%mm6, (%1,%3) \n\t"
00727 :: "r" (src),
00728 "r" (src + 4*stride),
00729 "r" ((long) stride ),
00730 "r" ((long)(3*stride))
00731 );
00732 }
00733 }
00734
00735 #ifdef CONFIG_ENCODERS
/**
 * Sum of squared pixel values over a 16x16 block.
 *
 * Each of the 16 loop iterations reads one 16-byte row, widens the bytes to
 * words, squares and pair-wise adds them with pmaddwd, and accumulates the
 * 32-bit results in mm7. The two 32-bit halves of mm7 are added at the end.
 *
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between consecutive rows
 * @return the sum of the squares
 */
00736 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
00737 int tmp;
00738 asm volatile (
/* ecx = row counter, mm0 = 0 for widening, mm7 = accumulator */
00739 "movl $16,%%ecx\n"
00740 "pxor %%mm0,%%mm0\n"
00741 "pxor %%mm7,%%mm7\n"
00742 "1:\n"
00743 "movq (%0),%%mm2\n"
00744 "movq 8(%0),%%mm3\n"
00745
00746 "movq %%mm2,%%mm1\n"
00747
00748 "punpckhbw %%mm0,%%mm1\n"
00749 "punpcklbw %%mm0,%%mm2\n"
00750
00751 "movq %%mm3,%%mm4\n"
00752 "punpckhbw %%mm0,%%mm3\n"
00753 "punpcklbw %%mm0,%%mm4\n"
00754
/* square each word and add neighbouring products */
00755 "pmaddwd %%mm1,%%mm1\n"
00756 "pmaddwd %%mm2,%%mm2\n"
00757
00758 "pmaddwd %%mm3,%%mm3\n"
00759 "pmaddwd %%mm4,%%mm4\n"
00760
00761 "paddd %%mm1,%%mm2\n"
00762
00763 "paddd %%mm3,%%mm4\n"
00764 "paddd %%mm2,%%mm7\n"
00765
00766 "add %2, %0\n"
00767 "paddd %%mm4,%%mm7\n"
00768 "dec %%ecx\n"
00769 "jnz 1b\n"
00770
/* add the high and low dword of the accumulator */
00771 "movq %%mm7,%%mm1\n"
00772 "psrlq $32, %%mm7\n"
00773 "paddd %%mm7,%%mm1\n"
00774 "movd %%mm1,%1\n"
00775 : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
00776 return tmp;
00777 }
00778
/**
 * Sum of squared differences between two blocks that are 8 pixels wide.
 *
 * Two rows are processed per loop iteration and the loop runs h>>1 times.
 * The loop body executes before the counter is tested (decl / jnz), so h is
 * expected to be at least 2.
 *
 * @param v         not used by this function
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between consecutive rows, used for both blocks
 * @param h         number of rows
 * @return the sum of (pix1 - pix2)^2 over the processed rows
 */
00779 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
00780 int tmp;
00781 asm volatile (
/* ecx = h/2 iterations, mm0 = 0 for widening, mm7 = accumulator */
00782 "movl %4,%%ecx\n"
00783 "shr $1,%%ecx\n"
00784 "pxor %%mm0,%%mm0\n"
00785 "pxor %%mm7,%%mm7\n"
00786 "1:\n"
00787 "movq (%0),%%mm1\n"
00788 "movq (%1),%%mm2\n"
00789 "movq (%0,%3),%%mm3\n"
00790 "movq (%1,%3),%%mm4\n"
00791
00792
00793
00794
/* |a - b| per byte: one of the two saturating differences is zero, the other
 * is the magnitude, so or-ing them gives the absolute difference */
00795 "movq %%mm1,%%mm5\n"
00796 "movq %%mm3,%%mm6\n"
00797 "psubusb %%mm2,%%mm1\n"
00798 "psubusb %%mm4,%%mm3\n"
00799 "psubusb %%mm5,%%mm2\n"
00800 "psubusb %%mm6,%%mm4\n"
00801
00802 "por %%mm1,%%mm2\n"
00803 "por %%mm3,%%mm4\n"
00804
00805
00806 "movq %%mm2,%%mm1\n"
00807 "movq %%mm4,%%mm3\n"
00808
00809 "punpckhbw %%mm0,%%mm2\n"
00810 "punpckhbw %%mm0,%%mm4\n"
00811 "punpcklbw %%mm0,%%mm1\n"
00812 "punpcklbw %%mm0,%%mm3\n"
00813
/* square the differences and add neighbouring products */
00814 "pmaddwd %%mm2,%%mm2\n"
00815 "pmaddwd %%mm4,%%mm4\n"
00816 "pmaddwd %%mm1,%%mm1\n"
00817 "pmaddwd %%mm3,%%mm3\n"
00818
/* advance both pointers by two rows */
00819 "lea (%0,%3,2), %0\n"
00820 "lea (%1,%3,2), %1\n"
00821
00822 "paddd %%mm2,%%mm1\n"
00823 "paddd %%mm4,%%mm3\n"
00824 "paddd %%mm1,%%mm7\n"
00825 "paddd %%mm3,%%mm7\n"
00826
00827 "decl %%ecx\n"
00828 "jnz 1b\n"
00829
/* add the high and low dword of the accumulator */
00830 "movq %%mm7,%%mm1\n"
00831 "psrlq $32, %%mm7\n"
00832 "paddd %%mm7,%%mm1\n"
00833 "movd %%mm1,%2\n"
00834 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
00835 : "r" ((long)line_size) , "m" (h)
00836 : "%ecx");
00837 return tmp;
00838 }
00839
/**
 * Sum of squared differences between two blocks that are 16 pixels wide.
 *
 * One row (two 8-byte halves) is processed per loop iteration and the loop
 * runs h times. The loop body executes before the counter is tested
 * (decl / jnz), so h is expected to be at least 1.
 *
 * @param v         not used by this function
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between consecutive rows, used for both blocks
 * @param h         number of rows
 * @return the sum of (pix1 - pix2)^2 over the processed rows
 */
00840 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
00841 int tmp;
00842 asm volatile (
/* ecx = row counter, mm0 = 0 for widening, mm7 = accumulator */
00843 "movl %4,%%ecx\n"
00844 "pxor %%mm0,%%mm0\n"
00845 "pxor %%mm7,%%mm7\n"
00846 "1:\n"
00847 "movq (%0),%%mm1\n"
00848 "movq (%1),%%mm2\n"
00849 "movq 8(%0),%%mm3\n"
00850 "movq 8(%1),%%mm4\n"
00851
00852
00853
00854
/* |a - b| per byte from two saturating subtractions or-ed together */
00855 "movq %%mm1,%%mm5\n"
00856 "movq %%mm3,%%mm6\n"
00857 "psubusb %%mm2,%%mm1\n"
00858 "psubusb %%mm4,%%mm3\n"
00859 "psubusb %%mm5,%%mm2\n"
00860 "psubusb %%mm6,%%mm4\n"
00861
00862 "por %%mm1,%%mm2\n"
00863 "por %%mm3,%%mm4\n"
00864
00865
00866 "movq %%mm2,%%mm1\n"
00867 "movq %%mm4,%%mm3\n"
00868
00869 "punpckhbw %%mm0,%%mm2\n"
00870 "punpckhbw %%mm0,%%mm4\n"
00871 "punpcklbw %%mm0,%%mm1\n"
00872 "punpcklbw %%mm0,%%mm3\n"
00873
/* square the differences and add neighbouring products */
00874 "pmaddwd %%mm2,%%mm2\n"
00875 "pmaddwd %%mm4,%%mm4\n"
00876 "pmaddwd %%mm1,%%mm1\n"
00877 "pmaddwd %%mm3,%%mm3\n"
00878
00879 "add %3,%0\n"
00880 "add %3,%1\n"
00881
00882 "paddd %%mm2,%%mm1\n"
00883 "paddd %%mm4,%%mm3\n"
00884 "paddd %%mm1,%%mm7\n"
00885 "paddd %%mm3,%%mm7\n"
00886
00887 "decl %%ecx\n"
00888 "jnz 1b\n"
00889
/* add the high and low dword of the accumulator */
00890 "movq %%mm7,%%mm1\n"
00891 "psrlq $32, %%mm7\n"
00892 "paddd %%mm7,%%mm1\n"
00893 "movd %%mm1,%2\n"
00894 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
00895 : "r" ((long)line_size) , "m" (h)
00896 : "%ecx");
00897 return tmp;
00898 }
00899
/**
 * Sum of squared differences between two blocks that are 16 pixels wide,
 * using 128-bit XMM registers.
 *
 * Two rows are processed per loop iteration and the loop runs h>>1 times.
 * Rows are loaded with movdqu, so no alignment of pix1 / pix2 is required by
 * the loads. The loop body executes before the counter is tested
 * (decl / jnz), so h is expected to be at least 2.
 *
 * @param v         not used by this function
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between consecutive rows, used for both blocks
 * @param h         number of rows
 * @return the sum of (pix1 - pix2)^2 over the processed rows
 */
00900 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
00901 int tmp;
00902 asm volatile (
/* %2 (h) becomes the iteration count h/2; xmm0 = 0, xmm7 = accumulator */
00903 "shr $1,%2\n"
00904 "pxor %%xmm0,%%xmm0\n"
00905 "pxor %%xmm7,%%xmm7\n"
00906 "1:\n"
00907 "movdqu (%0),%%xmm1\n"
00908 "movdqu (%1),%%xmm2\n"
00909 "movdqu (%0,%4),%%xmm3\n"
00910 "movdqu (%1,%4),%%xmm4\n"
00911
00912
00913
00914
/* |a - b| per byte from two saturating subtractions or-ed together */
00915 "movdqa %%xmm1,%%xmm5\n"
00916 "movdqa %%xmm3,%%xmm6\n"
00917 "psubusb %%xmm2,%%xmm1\n"
00918 "psubusb %%xmm4,%%xmm3\n"
00919 "psubusb %%xmm5,%%xmm2\n"
00920 "psubusb %%xmm6,%%xmm4\n"
00921
00922 "por %%xmm1,%%xmm2\n"
00923 "por %%xmm3,%%xmm4\n"
00924
00925
00926 "movdqa %%xmm2,%%xmm1\n"
00927 "movdqa %%xmm4,%%xmm3\n"
00928
00929 "punpckhbw %%xmm0,%%xmm2\n"
00930 "punpckhbw %%xmm0,%%xmm4\n"
00931 "punpcklbw %%xmm0,%%xmm1\n"
00932 "punpcklbw %%xmm0,%%xmm3\n"
00933
/* square the differences and add neighbouring products */
00934 "pmaddwd %%xmm2,%%xmm2\n"
00935 "pmaddwd %%xmm4,%%xmm4\n"
00936 "pmaddwd %%xmm1,%%xmm1\n"
00937 "pmaddwd %%xmm3,%%xmm3\n"
00938
/* advance both pointers by two rows */
00939 "lea (%0,%4,2), %0\n"
00940 "lea (%1,%4,2), %1\n"
00941
00942 "paddd %%xmm2,%%xmm1\n"
00943 "paddd %%xmm4,%%xmm3\n"
00944 "paddd %%xmm1,%%xmm7\n"
00945 "paddd %%xmm3,%%xmm7\n"
00946
00947 "decl %2\n"
00948 "jnz 1b\n"
00949
/* horizontal add of the four dwords: fold by 8 bytes, then by 4 bytes */
00950 "movdqa %%xmm7,%%xmm1\n"
00951 "psrldq $8, %%xmm7\n"
00952 "paddd %%xmm1,%%xmm7\n"
00953 "movdqa %%xmm7,%%xmm1\n"
00954 "psrldq $4, %%xmm7\n"
00955 "paddd %%xmm1,%%xmm7\n"
00956 "movd %%xmm7,%3\n"
00957 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
00958 : "r" ((long)line_size));
00959 return tmp;
00960 }
00961
/**
 * High-frequency measure over a block that is 8 pixels wide and h rows high.
 *
 * For every row the horizontal differences p[x] - p[x+1] are formed for
 * x = 0..6 (the eighth lane is 0 - 0). For every pair of consecutive rows
 * the absolute value of the difference between their horizontal differences
 * is accumulated. The result is the sum of those absolute values.
 *
 * Two rows are handled before the loop and two more per loop iteration. The
 * counter starts at h-2 and the loop ends when it reaches exactly 0
 * (subl $2 / jnz, tested after the body), so h-2 must be a positive even
 * number.
 * NOTE(review): the last loop iteration loads one row beyond the h rows
 * (the second half of the body reads the row following the last pair) --
 * confirm that callers provide readable memory there.
 *
 * @param pix1      top-left pixel of the block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 * @return the accumulated sum
 */
00962 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
00963 int tmp;
00964 asm volatile (
/* ecx = h-2, mm7 = 0 for widening, mm6 = 16-bit accumulators */
00965 "movl %3,%%ecx\n"
00966 "pxor %%mm7,%%mm7\n"
00967 "pxor %%mm6,%%mm6\n"
00968
/* first row: mm0 = pixels with the top byte cleared, mm1 = pixels shifted
 * down by one byte; after widening, mm0/mm2 = p[x] - p[x+1] */
00969 "movq (%0),%%mm0\n"
00970 "movq %%mm0, %%mm1\n"
00971 "psllq $8, %%mm0\n"
00972 "psrlq $8, %%mm1\n"
00973 "psrlq $8, %%mm0\n"
00974 "movq %%mm0, %%mm2\n"
00975 "movq %%mm1, %%mm3\n"
00976 "punpcklbw %%mm7,%%mm0\n"
00977 "punpcklbw %%mm7,%%mm1\n"
00978 "punpckhbw %%mm7,%%mm2\n"
00979 "punpckhbw %%mm7,%%mm3\n"
00980 "psubw %%mm1, %%mm0\n"
00981 "psubw %%mm3, %%mm2\n"
00982
00983 "add %2,%0\n"
00984
/* second row: same horizontal differences in mm4/mm5 */
00985 "movq (%0),%%mm4\n"
00986 "movq %%mm4, %%mm1\n"
00987 "psllq $8, %%mm4\n"
00988 "psrlq $8, %%mm1\n"
00989 "psrlq $8, %%mm4\n"
00990 "movq %%mm4, %%mm5\n"
00991 "movq %%mm1, %%mm3\n"
00992 "punpcklbw %%mm7,%%mm4\n"
00993 "punpcklbw %%mm7,%%mm1\n"
00994 "punpckhbw %%mm7,%%mm5\n"
00995 "punpckhbw %%mm7,%%mm3\n"
00996 "psubw %%mm1, %%mm4\n"
00997 "psubw %%mm3, %%mm5\n"
/* vertical difference of the horizontal differences, then absolute value
 * via sign mask (xor with mask, subtract mask) */
00998 "psubw %%mm4, %%mm0\n"
00999 "psubw %%mm5, %%mm2\n"
01000 "pxor %%mm3, %%mm3\n"
01001 "pxor %%mm1, %%mm1\n"
01002 "pcmpgtw %%mm0, %%mm3\n\t"
01003 "pcmpgtw %%mm2, %%mm1\n\t"
01004 "pxor %%mm3, %%mm0\n"
01005 "pxor %%mm1, %%mm2\n"
01006 "psubw %%mm3, %%mm0\n"
01007 "psubw %%mm1, %%mm2\n"
01008 "paddw %%mm0, %%mm2\n"
01009 "paddw %%mm2, %%mm6\n"
01010
01011 "add %2,%0\n"
/* main loop: mm4/mm5 carry the previous row's horizontal differences in */
01012 "1:\n"
01013
01014 "movq (%0),%%mm0\n"
01015 "movq %%mm0, %%mm1\n"
01016 "psllq $8, %%mm0\n"
01017 "psrlq $8, %%mm1\n"
01018 "psrlq $8, %%mm0\n"
01019 "movq %%mm0, %%mm2\n"
01020 "movq %%mm1, %%mm3\n"
01021 "punpcklbw %%mm7,%%mm0\n"
01022 "punpcklbw %%mm7,%%mm1\n"
01023 "punpckhbw %%mm7,%%mm2\n"
01024 "punpckhbw %%mm7,%%mm3\n"
01025 "psubw %%mm1, %%mm0\n"
01026 "psubw %%mm3, %%mm2\n"
01027 "psubw %%mm0, %%mm4\n"
01028 "psubw %%mm2, %%mm5\n"
01029 "pxor %%mm3, %%mm3\n"
01030 "pxor %%mm1, %%mm1\n"
01031 "pcmpgtw %%mm4, %%mm3\n\t"
01032 "pcmpgtw %%mm5, %%mm1\n\t"
01033 "pxor %%mm3, %%mm4\n"
01034 "pxor %%mm1, %%mm5\n"
01035 "psubw %%mm3, %%mm4\n"
01036 "psubw %%mm1, %%mm5\n"
01037 "paddw %%mm4, %%mm5\n"
01038 "paddw %%mm5, %%mm6\n"
01039
01040 "add %2,%0\n"
01041
/* second row of the iteration; mm0/mm2 hold the row just processed */
01042 "movq (%0),%%mm4\n"
01043 "movq %%mm4, %%mm1\n"
01044 "psllq $8, %%mm4\n"
01045 "psrlq $8, %%mm1\n"
01046 "psrlq $8, %%mm4\n"
01047 "movq %%mm4, %%mm5\n"
01048 "movq %%mm1, %%mm3\n"
01049 "punpcklbw %%mm7,%%mm4\n"
01050 "punpcklbw %%mm7,%%mm1\n"
01051 "punpckhbw %%mm7,%%mm5\n"
01052 "punpckhbw %%mm7,%%mm3\n"
01053 "psubw %%mm1, %%mm4\n"
01054 "psubw %%mm3, %%mm5\n"
01055 "psubw %%mm4, %%mm0\n"
01056 "psubw %%mm5, %%mm2\n"
01057 "pxor %%mm3, %%mm3\n"
01058 "pxor %%mm1, %%mm1\n"
01059 "pcmpgtw %%mm0, %%mm3\n\t"
01060 "pcmpgtw %%mm2, %%mm1\n\t"
01061 "pxor %%mm3, %%mm0\n"
01062 "pxor %%mm1, %%mm2\n"
01063 "psubw %%mm3, %%mm0\n"
01064 "psubw %%mm1, %%mm2\n"
01065 "paddw %%mm0, %%mm2\n"
01066 "paddw %%mm2, %%mm6\n"
01067
01068 "add %2,%0\n"
01069 "subl $2, %%ecx\n"
01070 " jnz 1b\n"
01071
/* widen the four 16-bit accumulators to 32 bits and add them up */
01072 "movq %%mm6, %%mm0\n"
01073 "punpcklwd %%mm7,%%mm0\n"
01074 "punpckhwd %%mm7,%%mm6\n"
01075 "paddd %%mm0, %%mm6\n"
01076
01077 "movq %%mm6,%%mm0\n"
01078 "psrlq $32, %%mm6\n"
01079 "paddd %%mm6,%%mm0\n"
01080 "movd %%mm0,%1\n"
01081 : "+r" (pix1), "=r"(tmp)
01082 : "r" ((long)line_size) , "g" (h-2)
01083 : "%ecx");
01084 return tmp;
01085 }
01086
/**
 * High-frequency measure over a block that is 16 pixels wide and h rows high.
 *
 * The asm covers the left part: for every row the horizontal differences
 * p[x] - p[x+1] are formed for x = 0..7 from two loads at byte offsets 0 and
 * 1 (so 9 bytes of each row are read), and for every pair of consecutive
 * rows the absolute value of the difference between their horizontal
 * differences is accumulated. The right part is added by calling
 * hf_noise8_mmx on the columns starting at offset 8.
 *
 * Two rows are handled before the loop and two more per loop iteration. The
 * counter starts at h-2 and the loop ends when it reaches exactly 0
 * (subl $2 / jnz, tested after the body), so h-2 must be a positive even
 * number.
 * NOTE(review): as in hf_noise8_mmx, the last loop iteration loads one row
 * beyond the h rows -- confirm that callers provide readable memory there.
 *
 * @param pix1      top-left pixel of the block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 * @return the accumulated sum for the left part plus the hf_noise8_mmx
 *         result for the right part
 */
01087 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
01088 int tmp;
/* keep the original pointer: the asm advances pix1 */
01089 uint8_t * pix= pix1;
01090 asm volatile (
/* ecx = h-2, mm7 = 0 for widening, mm6 = 16-bit accumulators */
01091 "movl %3,%%ecx\n"
01092 "pxor %%mm7,%%mm7\n"
01093 "pxor %%mm6,%%mm6\n"
01094
/* first row: mm0/mm2 = p[x] - p[x+1] */
01095 "movq (%0),%%mm0\n"
01096 "movq 1(%0),%%mm1\n"
01097 "movq %%mm0, %%mm2\n"
01098 "movq %%mm1, %%mm3\n"
01099 "punpcklbw %%mm7,%%mm0\n"
01100 "punpcklbw %%mm7,%%mm1\n"
01101 "punpckhbw %%mm7,%%mm2\n"
01102 "punpckhbw %%mm7,%%mm3\n"
01103 "psubw %%mm1, %%mm0\n"
01104 "psubw %%mm3, %%mm2\n"
01105
01106 "add %2,%0\n"
01107
/* second row: same horizontal differences in mm4/mm5 */
01108 "movq (%0),%%mm4\n"
01109 "movq 1(%0),%%mm1\n"
01110 "movq %%mm4, %%mm5\n"
01111 "movq %%mm1, %%mm3\n"
01112 "punpcklbw %%mm7,%%mm4\n"
01113 "punpcklbw %%mm7,%%mm1\n"
01114 "punpckhbw %%mm7,%%mm5\n"
01115 "punpckhbw %%mm7,%%mm3\n"
01116 "psubw %%mm1, %%mm4\n"
01117 "psubw %%mm3, %%mm5\n"
/* vertical difference of the horizontal differences, then absolute value
 * via sign mask (xor with mask, subtract mask) */
01118 "psubw %%mm4, %%mm0\n"
01119 "psubw %%mm5, %%mm2\n"
01120 "pxor %%mm3, %%mm3\n"
01121 "pxor %%mm1, %%mm1\n"
01122 "pcmpgtw %%mm0, %%mm3\n\t"
01123 "pcmpgtw %%mm2, %%mm1\n\t"
01124 "pxor %%mm3, %%mm0\n"
01125 "pxor %%mm1, %%mm2\n"
01126 "psubw %%mm3, %%mm0\n"
01127 "psubw %%mm1, %%mm2\n"
01128 "paddw %%mm0, %%mm2\n"
01129 "paddw %%mm2, %%mm6\n"
01130
01131 "add %2,%0\n"
/* main loop: mm4/mm5 carry the previous row's horizontal differences in */
01132 "1:\n"
01133
01134 "movq (%0),%%mm0\n"
01135 "movq 1(%0),%%mm1\n"
01136 "movq %%mm0, %%mm2\n"
01137 "movq %%mm1, %%mm3\n"
01138 "punpcklbw %%mm7,%%mm0\n"
01139 "punpcklbw %%mm7,%%mm1\n"
01140 "punpckhbw %%mm7,%%mm2\n"
01141 "punpckhbw %%mm7,%%mm3\n"
01142 "psubw %%mm1, %%mm0\n"
01143 "psubw %%mm3, %%mm2\n"
01144 "psubw %%mm0, %%mm4\n"
01145 "psubw %%mm2, %%mm5\n"
01146 "pxor %%mm3, %%mm3\n"
01147 "pxor %%mm1, %%mm1\n"
01148 "pcmpgtw %%mm4, %%mm3\n\t"
01149 "pcmpgtw %%mm5, %%mm1\n\t"
01150 "pxor %%mm3, %%mm4\n"
01151 "pxor %%mm1, %%mm5\n"
01152 "psubw %%mm3, %%mm4\n"
01153 "psubw %%mm1, %%mm5\n"
01154 "paddw %%mm4, %%mm5\n"
01155 "paddw %%mm5, %%mm6\n"
01156
01157 "add %2,%0\n"
01158
/* second row of the iteration; mm0/mm2 hold the row just processed */
01159 "movq (%0),%%mm4\n"
01160 "movq 1(%0),%%mm1\n"
01161 "movq %%mm4, %%mm5\n"
01162 "movq %%mm1, %%mm3\n"
01163 "punpcklbw %%mm7,%%mm4\n"
01164 "punpcklbw %%mm7,%%mm1\n"
01165 "punpckhbw %%mm7,%%mm5\n"
01166 "punpckhbw %%mm7,%%mm3\n"
01167 "psubw %%mm1, %%mm4\n"
01168 "psubw %%mm3, %%mm5\n"
01169 "psubw %%mm4, %%mm0\n"
01170 "psubw %%mm5, %%mm2\n"
01171 "pxor %%mm3, %%mm3\n"
01172 "pxor %%mm1, %%mm1\n"
01173 "pcmpgtw %%mm0, %%mm3\n\t"
01174 "pcmpgtw %%mm2, %%mm1\n\t"
01175 "pxor %%mm3, %%mm0\n"
01176 "pxor %%mm1, %%mm2\n"
01177 "psubw %%mm3, %%mm0\n"
01178 "psubw %%mm1, %%mm2\n"
01179 "paddw %%mm0, %%mm2\n"
01180 "paddw %%mm2, %%mm6\n"
01181
01182 "add %2,%0\n"
01183 "subl $2, %%ecx\n"
01184 " jnz 1b\n"
01185
/* widen the four 16-bit accumulators to 32 bits and add them up */
01186 "movq %%mm6, %%mm0\n"
01187 "punpcklwd %%mm7,%%mm0\n"
01188 "punpckhwd %%mm7,%%mm6\n"
01189 "paddd %%mm0, %%mm6\n"
01190
01191 "movq %%mm6,%%mm0\n"
01192 "psrlq $32, %%mm6\n"
01193 "paddd %%mm6,%%mm0\n"
01194 "movd %%mm0,%1\n"
01195 : "+r" (pix1), "=r"(tmp)
01196 : "r" ((long)line_size) , "g" (h-2)
01197 : "%ecx");
/* right part: columns 8..15 via the 8-wide routine */
01198 return tmp + hf_noise8_mmx(pix+8, line_size, h);
01199 }
01200
01201 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
01202 MpegEncContext *c = p;
01203 int score1, score2;
01204
01205 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
01206 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
01207 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
01208
01209 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
01210 else return score1 + FFABS(score2)*8;
01211 }
01212
01213 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
01214 MpegEncContext *c = p;
01215 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
01216 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
01217
01218 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
01219 else return score1 + FFABS(score2)*8;
01220 }
01221
/**
 * Vertical SAD inside one 16-pixel-wide block: sum over rows of
 * |pix[y][x] - pix[y+1][x]| for x = 0..15, plain MMX version.
 *
 * @param v         unused
 * @param pix       block start; asserted to be 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between rows; asserted multiple of 8
 * @param h         number of rows; the counter is reduced by 2 before the
 *                  loop and by 2 per iteration until it is exactly zero
 * @return low 16 bits of the accumulated sum
 */
01222 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
01223 int tmp;
01224
/* NOTE(review): the (int) cast drops the upper pointer bits on 64-bit targets;
   only the low three bits are tested here. */
01225 assert( (((int)pix) & 7) == 0);
01226 assert((line_size &7) ==0);
01227
/* SUM(in0, in1, out0, out1): in0/in1 hold the previous row (16 bytes). Loads
   the current row, leaves a copy of it in out0/out1 for the next step, and
   adds the absolute byte differences (widened to words via mm7 = 0) to the
   four word lanes of mm6. Clobbers mm2, mm3, in0 and in1. */
01228 #define SUM(in0, in1, out0, out1) \
01229 "movq (%0), %%mm2\n"\
01230 "movq 8(%0), %%mm3\n"\
01231 "add %2,%0\n"\
01232 "movq %%mm2, " #out0 "\n"\
01233 "movq %%mm3, " #out1 "\n"\
/* |cur - prev| = (cur -sat prev) | (prev -sat cur) using unsigned saturation */\
01234 "psubusb " #in0 ", %%mm2\n"\
01235 "psubusb " #in1 ", %%mm3\n"\
01236 "psubusb " #out0 ", " #in0 "\n"\
01237 "psubusb " #out1 ", " #in1 "\n"\
01238 "por %%mm2, " #in0 "\n"\
01239 "por %%mm3, " #in1 "\n"\
01240 "movq " #in0 ", %%mm2\n"\
01241 "movq " #in1 ", %%mm3\n"\
01242 "punpcklbw %%mm7, " #in0 "\n"\
01243 "punpcklbw %%mm7, " #in1 "\n"\
01244 "punpckhbw %%mm7, %%mm2\n"\
01245 "punpckhbw %%mm7, %%mm3\n"\
01246 "paddw " #in1 ", " #in0 "\n"\
01247 "paddw %%mm3, %%mm2\n"\
01248 "paddw %%mm2, " #in0 "\n"\
01249 "paddw " #in0 ", %%mm6\n"
01250
01251
01252 asm volatile (
/* ecx = h, mm6 = accumulator, mm7 = 0, mm0/mm1 = first row */
01253 "movl %3,%%ecx\n"
01254 "pxor %%mm6,%%mm6\n"
01255 "pxor %%mm7,%%mm7\n"
01256 "movq (%0),%%mm0\n"
01257 "movq 8(%0),%%mm1\n"
01258 "add %2,%0\n"
01259 "subl $2, %%ecx\n"
01260 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01261 "1:\n"
01262
/* two rows per iteration, ping-ponging between mm0/mm1 and mm4/mm5 */
01263 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
01264
01265 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01266
01267 "subl $2, %%ecx\n"
01268 "jnz 1b\n"
01269
/* horizontal add of the four word lanes; the total ends up in the low word */
01270 "movq %%mm6,%%mm0\n"
01271 "psrlq $32, %%mm6\n"
01272 "paddw %%mm6,%%mm0\n"
01273 "movq %%mm0,%%mm6\n"
01274 "psrlq $16, %%mm0\n"
01275 "paddw %%mm6,%%mm0\n"
01276 "movd %%mm0,%1\n"
01277 : "+r" (pix), "=r"(tmp)
01278 : "r" ((long)line_size) , "m" (h)
01279 : "%ecx");
/* the upper word of tmp still holds a partial sum, so mask it off */
01280 return tmp & 0xFFFF;
01281 }
01282 #undef SUM
01283
/**
 * Vertical SAD inside one 16-pixel-wide block (sum of absolute differences
 * between each row and the next), using psadbw.
 *
 * @param v         unused
 * @param pix       block start; asserted to be 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between rows; asserted multiple of 8
 * @param h         number of rows; the counter is reduced by 2 before the
 *                  loop and by 2 per iteration until it is exactly zero
 * @return the low 32 bits of the mm6 accumulator
 */
01284 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
01285 int tmp;
01286
/* NOTE(review): the (int) cast drops the upper pointer bits on 64-bit targets;
   only the low three bits are tested here. */
01287 assert( (((int)pix) & 7) == 0);
01288 assert((line_size &7) ==0);
01289
/* SUM(in0, in1, out0, out1): in0/in1 hold the previous row. Loads the current
   row into out0/out1, replaces in0/in1 by their psadbw against it and adds
   both results to mm6. */
01290 #define SUM(in0, in1, out0, out1) \
01291 "movq (%0), " #out0 "\n"\
01292 "movq 8(%0), " #out1 "\n"\
01293 "add %2,%0\n"\
01294 "psadbw " #out0 ", " #in0 "\n"\
01295 "psadbw " #out1 ", " #in1 "\n"\
01296 "paddw " #in1 ", " #in0 "\n"\
01297 "paddw " #in0 ", %%mm6\n"
01298
01299 asm volatile (
/* ecx = h, mm6 = accumulator, mm0/mm1 = first row; mm7 is cleared but this
   variant's SUM does not reference it */
01300 "movl %3,%%ecx\n"
01301 "pxor %%mm6,%%mm6\n"
01302 "pxor %%mm7,%%mm7\n"
01303 "movq (%0),%%mm0\n"
01304 "movq 8(%0),%%mm1\n"
01305 "add %2,%0\n"
01306 "subl $2, %%ecx\n"
01307 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01308 "1:\n"
01309
/* two rows per iteration, ping-ponging between mm0/mm1 and mm4/mm5 */
01310 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
01311
01312 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01313
01314 "subl $2, %%ecx\n"
01315 "jnz 1b\n"
01316
01317 "movd %%mm6,%1\n"
01318 : "+r" (pix), "=r"(tmp)
01319 : "r" ((long)line_size) , "m" (h)
01320 : "%ecx");
01321 return tmp;
01322 }
01323 #undef SUM
01324
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks, plain MMX
 * version. For every row the byte-wise difference pix1 - pix2 is formed
 * (wrapping, then biased by 0x80), and the absolute change of that biased
 * difference from one row to the next is accumulated.
 *
 * @param v         unused
 * @param pix1      first block; asserted to be 8-byte aligned
 * @param pix2      second block; asserted to be 8-byte aligned
 * @param line_size distance in bytes between rows of both blocks; asserted
 *                  multiple of 8
 * @param h         number of rows; the counter is reduced by 2 before the
 *                  loop and by 2 per iteration until it is exactly zero
 * @return low 15 bits of the accumulated sum
 */
01325 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
01326 int tmp;
01327
/* NOTE(review): the (int) casts drop the upper pointer bits on 64-bit targets;
   only the low three bits are tested here. */
01328 assert( (((int)pix1) & 7) == 0);
01329 assert( (((int)pix2) & 7) == 0);
01330 assert((line_size &7) ==0);
01331
/* SUM(in0, in1, out0, out1): in0/in1 hold the previous row's biased
   difference. Computes the current row's biased difference, leaves a copy in
   out0/out1, and adds |current - previous| to the word lanes of mm6.
   mm7 holds 0x80 in every byte here, so each widened word carries an extra
   0x8000; exactly four such words are added per lane, so the extra cancels
   modulo 2^16. Clobbers mm2, mm3, in0 and in1. */
01332 #define SUM(in0, in1, out0, out1) \
01333 "movq (%0),%%mm2\n"\
01334 "movq (%1)," #out0 "\n"\
01335 "movq 8(%0),%%mm3\n"\
01336 "movq 8(%1)," #out1 "\n"\
01337 "add %3,%0\n"\
01338 "add %3,%1\n"\
01339 "psubb " #out0 ", %%mm2\n"\
01340 "psubb " #out1 ", %%mm3\n"\
01341 "pxor %%mm7, %%mm2\n"\
01342 "pxor %%mm7, %%mm3\n"\
01343 "movq %%mm2, " #out0 "\n"\
01344 "movq %%mm3, " #out1 "\n"\
/* unsigned |a - b| via two saturating subtractions or-ed together */\
01345 "psubusb " #in0 ", %%mm2\n"\
01346 "psubusb " #in1 ", %%mm3\n"\
01347 "psubusb " #out0 ", " #in0 "\n"\
01348 "psubusb " #out1 ", " #in1 "\n"\
01349 "por %%mm2, " #in0 "\n"\
01350 "por %%mm3, " #in1 "\n"\
01351 "movq " #in0 ", %%mm2\n"\
01352 "movq " #in1 ", %%mm3\n"\
01353 "punpcklbw %%mm7, " #in0 "\n"\
01354 "punpcklbw %%mm7, " #in1 "\n"\
01355 "punpckhbw %%mm7, %%mm2\n"\
01356 "punpckhbw %%mm7, %%mm3\n"\
01357 "paddw " #in1 ", " #in0 "\n"\
01358 "paddw %%mm3, %%mm2\n"\
01359 "paddw %%mm2, " #in0 "\n"\
01360 "paddw " #in0 ", %%mm6\n"
01361
01362
01363 asm volatile (
01364 "movl %4,%%ecx\n"
01365 "pxor %%mm6,%%mm6\n"
/* build mm7 = 0x80 in every byte: all ones -> 0x8000 words -> signed-saturating pack */
01366 "pcmpeqw %%mm7,%%mm7\n"
01367 "psllw $15, %%mm7\n"
01368 "packsswb %%mm7, %%mm7\n"
/* first row: mm0/mm1 = (pix1 - pix2) ^ 0x80 */
01369 "movq (%0),%%mm0\n"
01370 "movq (%1),%%mm2\n"
01371 "movq 8(%0),%%mm1\n"
01372 "movq 8(%1),%%mm3\n"
01373 "add %3,%0\n"
01374 "add %3,%1\n"
01375 "subl $2, %%ecx\n"
01376 "psubb %%mm2, %%mm0\n"
01377 "psubb %%mm3, %%mm1\n"
01378 "pxor %%mm7, %%mm0\n"
01379 "pxor %%mm7, %%mm1\n"
01380 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01381 "1:\n"
01382
/* two rows per iteration, ping-ponging between mm0/mm1 and mm4/mm5 */
01383 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
01384
01385 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01386
01387 "subl $2, %%ecx\n"
01388 "jnz 1b\n"
01389
/* horizontal add of the four word lanes; the total ends up in the low word */
01390 "movq %%mm6,%%mm0\n"
01391 "psrlq $32, %%mm6\n"
01392 "paddw %%mm6,%%mm0\n"
01393 "movq %%mm0,%%mm6\n"
01394 "psrlq $16, %%mm0\n"
01395 "paddw %%mm6,%%mm0\n"
01396 "movd %%mm0,%2\n"
01397 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
01398 : "r" ((long)line_size) , "m" (h)
01399 : "%ecx");
01400 return tmp & 0x7FFF;
01401 }
01402 #undef SUM
01403
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks, psadbw
 * version. For every row the byte-wise difference pix1 - pix2 is formed
 * (wrapping, then biased by 0x80), and the sum of absolute differences
 * between consecutive rows of that biased difference is accumulated.
 *
 * @param v         unused
 * @param pix1      first block; asserted to be 8-byte aligned
 * @param pix2      second block; asserted to be 8-byte aligned
 * @param line_size distance in bytes between rows of both blocks; asserted
 *                  multiple of 8
 * @param h         number of rows; the counter is reduced by 2 before the
 *                  loop and by 2 per iteration until it is exactly zero
 * @return the low 32 bits of the mm6 accumulator
 */
01404 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
01405 int tmp;
01406
/* NOTE(review): the (int) casts drop the upper pointer bits on 64-bit targets;
   only the low three bits are tested here. */
01407 assert( (((int)pix1) & 7) == 0);
01408 assert( (((int)pix2) & 7) == 0);
01409 assert((line_size &7) ==0);
01410
/* SUM(in0, in1, out0, out1): in0/in1 hold the previous row's biased
   difference. Computes the current row's biased difference into out0/out1,
   replaces in0/in1 by their psadbw against it and adds both to mm6.
   Clobbers mm2 and mm3. */
01411 #define SUM(in0, in1, out0, out1) \
01412 "movq (%0)," #out0 "\n"\
01413 "movq (%1),%%mm2\n"\
01414 "movq 8(%0)," #out1 "\n"\
01415 "movq 8(%1),%%mm3\n"\
01416 "add %3,%0\n"\
01417 "add %3,%1\n"\
01418 "psubb %%mm2, " #out0 "\n"\
01419 "psubb %%mm3, " #out1 "\n"\
01420 "pxor %%mm7, " #out0 "\n"\
01421 "pxor %%mm7, " #out1 "\n"\
01422 "psadbw " #out0 ", " #in0 "\n"\
01423 "psadbw " #out1 ", " #in1 "\n"\
01424 "paddw " #in1 ", " #in0 "\n"\
01425 "paddw " #in0 ", %%mm6\n"
01426
01427 asm volatile (
01428 "movl %4,%%ecx\n"
01429 "pxor %%mm6,%%mm6\n"
/* build mm7 = 0x80 in every byte: all ones -> 0x8000 words -> signed-saturating pack */
01430 "pcmpeqw %%mm7,%%mm7\n"
01431 "psllw $15, %%mm7\n"
01432 "packsswb %%mm7, %%mm7\n"
/* first row: mm0/mm1 = (pix1 - pix2) ^ 0x80 */
01433 "movq (%0),%%mm0\n"
01434 "movq (%1),%%mm2\n"
01435 "movq 8(%0),%%mm1\n"
01436 "movq 8(%1),%%mm3\n"
01437 "add %3,%0\n"
01438 "add %3,%1\n"
01439 "subl $2, %%ecx\n"
01440 "psubb %%mm2, %%mm0\n"
01441 "psubb %%mm3, %%mm1\n"
01442 "pxor %%mm7, %%mm0\n"
01443 "pxor %%mm7, %%mm1\n"
01444 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01445 "1:\n"
01446
/* two rows per iteration, ping-ponging between mm0/mm1 and mm4/mm5 */
01447 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
01448
01449 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01450
01451 "subl $2, %%ecx\n"
01452 "jnz 1b\n"
01453
01454 "movd %%mm6,%2\n"
01455 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
01456 : "r" ((long)line_size) , "m" (h)
01457 : "%ecx");
01458 return tmp;
01459 }
01460 #undef SUM
01461
/**
 * Byte-wise wrapping difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 *
 * Whole groups of 16 bytes are handled by the MMX loop, the remaining
 * 0..15 bytes by the scalar tail loop. No byte at index >= w is read or
 * written.
 *
 * @param dst  output buffer, at least w bytes
 * @param src1 minuend bytes, at least w bytes
 * @param src2 subtrahend bytes, at least w bytes
 * @param w    number of bytes to process; values <= 0 do nothing
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;

    /* The asm loop tests its bound only after the first 16-byte group and
     * compares unsigned ("jb") against w-15. Without this guard a width
     * below 16 would write past dst, and a width below 15 would turn the
     * bound negative (huge when unsigned) so the loop would never stop. */
    if(w>=16)
    /* spelled __asm__ so the block also builds in strict ISO C modes */
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
        /* the loop stores through dst, which is only passed as a register */
        : "memory"
    );
    /* scalar tail: everything the 16-byte loop did not cover */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
01483
01484 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
01485 long i=0;
01486 uint8_t l, lt;
01487
01488 asm volatile(
01489 "1: \n\t"
01490 "movq -1(%1, %0), %%mm0 \n\t"
01491 "movq (%1, %0), %%mm1 \n\t"
01492 "movq -1(%2, %0), %%mm2 \n\t"
01493 "movq (%2, %0), %%mm3 \n\t"
01494 "movq %%mm2, %%mm4 \n\t"
01495 "psubb %%mm0, %%mm2 \n\t"
01496 "paddb %%mm1, %%mm2 \n\t"
01497 "movq %%mm4, %%mm5 \n\t"
01498 "pmaxub %%mm1, %%mm4 \n\t"
01499 "pminub %%mm5, %%mm1 \n\t"
01500 "pminub %%mm2, %%mm4 \n\t"
01501 "pmaxub %%mm1, %%mm4 \n\t"
01502 "psubb %%mm4, %%mm3 \n\t"
01503 "movq %%mm3, (%3, %0) \n\t"
01504 "add $8, %0 \n\t"
01505 "cmp %4, %0 \n\t"
01506 " jb 1b \n\t"
01507 : "+r" (i)
01508 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
01509 );
01510
01511 l= *left;
01512 lt= *left_top;
01513
01514 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
01515
01516 *left_top= src1[w-1];
01517 *left = src2[w-1];
01518 }
01519
/* DIFF_PIXELS_1(m,a,t,p1,p2): load pixels from p1 into a and from p2 into t
   with a "mov<m>" instruction and leave the word-wise difference p1 - p2 in a.
   Both registers are interleaved with the p1 bytes as the high byte of each
   word, so the high bytes cancel in the psubw and no zero register is needed.
   t is clobbered. */
01520 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
01521 "mov"#m" "#p1", "#a" \n\t"\
01522 "mov"#m" "#p2", "#t" \n\t"\
01523 "punpcklbw "#a", "#t" \n\t"\
01524 "punpcklbw "#a", "#a" \n\t"\
01525 "psubw "#t", "#a" \n\t"\
01526
/* DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp): compute p1 - p2 for eight
   consecutive rows into registers <mm>0 .. <mm>7 (row n in register n).
   m0 is the load suffix, m1 the suffix used to spill a full register.
   Rows 0-2 are addressed from the start pointers; the pointers (local copies
   p1b/p2b) are then advanced by 3*stride for rows 3-7. Register 7 serves as
   scratch for rows 0-6; for row 7 register 0 is spilled to temp, used as
   scratch, and reloaded. */
01527 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
01528 uint8_t *p1b=p1, *p2b=p2;\
01529 asm volatile(\
01530 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
01531 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
01532 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
01533 "add %4, %1 \n\t"\
01534 "add %4, %2 \n\t"\
01535 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
01536 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
01537 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
01538 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
01539 "mov"#m1" "#mm"0, %0 \n\t"\
01540 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
01541 "mov"#m1" %0, "#mm"0 \n\t"\
01542 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
01543 : "r"((long)stride), "r"((long)stride*3)\
01544 );\
01545 }
01546
01547
/* 4 pixels x 8 rows into mm0-mm7 (movd loads, movq spill) */
01548 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
/* 8 pixels x 8 rows into xmm0-xmm7 (movq loads, movdqa spill, so temp has to
   be 16-byte aligned) */
01549 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
01550
/* TRANSPOSE8(a..h, t): transpose an 8x8 matrix of words held in eight XMM
   registers, built from word, dword and qword SBUTTERFLY steps (SBUTTERFLY is
   defined outside this section; presumably an unpack-low/high pair -- verify).
   The result is left in a permuted register order; see the HADAMARD8 call
   that follows TRANSPOSE8 in HADAMARD8_DIFF_SSE2 for the order used by the
   caller.
   On x86-64 the extra register xmm8 is the temporary and t is unused. */
01551 #ifdef ARCH_X86_64
01552
01553 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
01554 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
01555 SBUTTERFLY(c,d,b,wd,dqa)\
01556 SBUTTERFLY(e,f,d,wd,dqa)\
01557 SBUTTERFLY(g,h,f,wd,dqa)\
01558 SBUTTERFLY(a,c,h,dq,dqa)\
01559 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
01560 SBUTTERFLY(e,g,b,dq,dqa)\
01561 SBUTTERFLY(d,f,g,dq,dqa)\
01562 SBUTTERFLY(a,e,f,qdq,dqa)\
01563 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
01564 SBUTTERFLY(h,b,d,qdq,dqa)\
01565 SBUTTERFLY(c,g,b,qdq,dqa)\
01566 "movdqa %%xmm8, "#g" \n\t"
01567 #else
/* Without a ninth register the temporary lives in memory: t is a memory
   operand and both t and 16+t (32 bytes, accessed with movdqa) are used as
   spill slots. */
01568 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
01569 "movdqa "#h", "#t" \n\t"\
01570 SBUTTERFLY(a,b,h,wd,dqa)\
01571 "movdqa "#h", 16"#t" \n\t"\
01572 "movdqa "#t", "#h" \n\t"\
01573 SBUTTERFLY(c,d,b,wd,dqa)\
01574 SBUTTERFLY(e,f,d,wd,dqa)\
01575 SBUTTERFLY(g,h,f,wd,dqa)\
01576 SBUTTERFLY(a,c,h,dq,dqa)\
01577 "movdqa "#h", "#t" \n\t"\
01578 "movdqa 16"#t", "#h" \n\t"\
01579 SBUTTERFLY(h,b,c,dq,dqa)\
01580 SBUTTERFLY(e,g,b,dq,dqa)\
01581 SBUTTERFLY(d,f,g,dq,dqa)\
01582 SBUTTERFLY(a,e,f,qdq,dqa)\
01583 SBUTTERFLY(h,d,e,qdq,dqa)\
01584 "movdqa "#h", 16"#t" \n\t"\
01585 "movdqa "#t", "#h" \n\t"\
01586 SBUTTERFLY(h,b,d,qdq,dqa)\
01587 SBUTTERFLY(c,g,b,qdq,dqa)\
01588 "movdqa 16"#t", "#g" \n\t"
01589 #endif
01590
/* LBUTTERFLY2(a1,b1,a2,b2): two interleaved in-place butterflies on words.
   For each pair: a = a + b, then b = 2*b - (a + b) = b - a (old values). */
01591 #define LBUTTERFLY2(a1,b1,a2,b2)\
01592 "paddw " #b1 ", " #a1 " \n\t"\
01593 "paddw " #b2 ", " #a2 " \n\t"\
01594 "paddw " #b1 ", " #b1 " \n\t"\
01595 "paddw " #b2 ", " #b2 " \n\t"\
01596 "psubw " #a1 ", " #b1 " \n\t"\
01597 "psubw " #a2 ", " #b2 " \n\t"
01598
/* HADAMARD8(m0..m7): three butterfly stages across eight registers, pairing
   registers at distance 1, then 2, then 4. */
01599 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
01600 LBUTTERFLY2(m0, m1, m2, m3)\
01601 LBUTTERFLY2(m4, m5, m6, m7)\
01602 LBUTTERFLY2(m0, m2, m1, m3)\
01603 LBUTTERFLY2(m4, m6, m5, m7)\
01604 LBUTTERFLY2(m0, m4, m1, m5)\
01605 LBUTTERFLY2(m2, m6, m3, m7)\
01606
/* HADAMARD8 on the eight MMX registers (4 words each) */
01607 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
01608
/* MMABS_*(a,z): replace the signed words in a by their absolute values.
   z is a scratch register (clobbered by the MMX and MMX2 variants, ignored
   by the SSSE3 variant). */

/* plain MMX: z = sign mask (0 > a); a = (a ^ z) - z */
01609 #define MMABS_MMX(a,z)\
01610 "pxor " #z ", " #z " \n\t"\
01611 "pcmpgtw " #a ", " #z " \n\t"\
01612 "pxor " #z ", " #a " \n\t"\
01613 "psubw " #z ", " #a " \n\t"
01614
/* MMX2: z = -a; a = max(a, -a) */
01615 #define MMABS_MMX2(a,z)\
01616 "pxor " #z ", " #z " \n\t"\
01617 "psubw " #a ", " #z " \n\t"\
01618 "pmaxsw " #z ", " #a " \n\t"
01619
/* SSSE3: single pabsw, z is not referenced */
01620 #define MMABS_SSSE3(a,z)\
01621 "pabsw " #a ", " #a " \n\t"
01622
/* MMABS_SUM(a,z,sum): sum += |a| with unsigned word saturation. Uses whichever
   MMABS is #defined at the point of expansion. */
01623 #define MMABS_SUM(a,z, sum)\
01624 MMABS(a,z)\
01625 "paddusw " #a ", " #sum " \n\t"
01626
/* Sum |xmm0| .. |xmm7| word-wise into xmm0 (saturating), using two partial
   sums in xmm0 and xmm1. xmm8/xmm9 are passed as scratch, so with an MMABS
   that really uses its scratch argument this needs the x86-64 registers. */
01627 #define MMABS_SUM_8x8_NOSPILL\
01628 MMABS(%%xmm0, %%xmm8)\
01629 MMABS(%%xmm1, %%xmm9)\
01630 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
01631 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
01632 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
01633 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
01634 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
01635 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
01636 "paddusw %%xmm1, %%xmm0 \n\t"
01637
01638 #ifdef ARCH_X86_64
01639 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
01640 #else
/* 32-bit variant: only xmm0-xmm7 exist, so xmm7 is spilled to the 16 bytes at
   (%1) to free it as scratch and is reloaded into xmm2 for the last term. */
01641 #define MMABS_SUM_8x8_SSE2\
01642 "movdqa %%xmm7, (%1) \n\t"\
01643 MMABS(%%xmm0, %%xmm7)\
01644 MMABS(%%xmm1, %%xmm7)\
01645 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
01646 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
01647 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
01648 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
01649 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
01650 "movdqa (%1), %%xmm2 \n\t"\
01651 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
01652 "paddusw %%xmm1, %%xmm0 \n\t"
01653 #endif
01654
/* LOAD4(o,a,b,c,d): load four consecutive quadwords from byte offsets
   o, o+8, o+16 and o+24 relative to asm operand %1 into a, b, c, d. */
01655 #define LOAD4(o, a, b, c, d)\
01656 "movq "#o"(%1), "#a" \n\t"\
01657 "movq "#o"+8(%1), "#b" \n\t"\
01658 "movq "#o"+16(%1), "#c" \n\t"\
01659 "movq "#o"+24(%1), "#d" \n\t"\
01660
/* STORE4(o,a,b,c,d): store a, b, c, d as four consecutive quadwords at byte
   offsets o, o+8, o+16 and o+24 relative to asm operand %1. */
01661 #define STORE4(o, a, b, c, d)\
01662 "movq "#a", "#o"(%1) \n\t"\
01663 "movq "#b", "#o"+8(%1) \n\t"\
01664 "movq "#c", "#o"+16(%1) \n\t"\
01665 "movq "#d", "#o"+24(%1) \n\t"\
01666
01667
01668
01669
/* HSUM_*(a,t,dst): horizontal sum (unsigned saturating word adds) of all words
   in register a. The total ends up in the low word of a, and the low 32 bits
   of a are then moved to dst with movd, so the bits above the low word of dst
   are leftovers that callers have to mask off. t is a scratch register. */

/* plain MMX: fold by shifting 32 then 16 bits */
01670 #define HSUM_MMX(a, t, dst)\
01671 "movq "#a", "#t" \n\t"\
01672 "psrlq $32, "#a" \n\t"\
01673 "paddusw "#t", "#a" \n\t"\
01674 "movq "#a", "#t" \n\t"\
01675 "psrlq $16, "#a" \n\t"\
01676 "paddusw "#t", "#a" \n\t"\
01677 "movd "#a", "#dst" \n\t"\
01678
/* MMX2: fold with pshufw (words 2,3 onto 0,1, then word 1 onto 0) */
01679 #define HSUM_MMX2(a, t, dst)\
01680 "pshufw $0x0E, "#a", "#t" \n\t"\
01681 "paddusw "#t", "#a" \n\t"\
01682 "pshufw $0x01, "#a", "#t" \n\t"\
01683 "paddusw "#t", "#a" \n\t"\
01684 "movd "#a", "#dst" \n\t"\
01685
/* SSE2: fold the upper 64 bits onto the lower ones, then as in the MMX2 case
   using pshuflw */
01686 #define HSUM_SSE2(a, t, dst)\
01687 "movhlps "#a", "#t" \n\t"\
01688 "paddusw "#t", "#a" \n\t"\
01689 "pshuflw $0x0E, "#a", "#t" \n\t"\
01690 "paddusw "#t", "#a" \n\t"\
01691 "pshuflw $0x01, "#a", "#t" \n\t"\
01692 "paddusw "#t", "#a" \n\t"\
01693 "movd "#a", "#dst" \n\t"\
01694
/* HADAMARD8_DIFF_MMX(cpu): define hadamard8_diff_<cpu>(), which returns the low
   16 bits of the saturating sum of absolute values of the 8x8 Hadamard
   transform of src1 - src2, using the eight MMX registers. The block is
   processed as two 4-column halves; temp (13 quadwords) holds transposed
   intermediate rows plus spill slots. MMABS and HSUM must be #defined to the
   wanted variants before the macro is expanded. TRANSPOSE4 and WARPER8_16_SQ
   are defined outside this section (the latter presumably builds the 16x16
   wrapper hadamard8_diff16_<cpu> -- verify). */
01695 #define HADAMARD8_DIFF_MMX(cpu) \
01696 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
01697 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
01698 int sum;\
01699 \
01700 assert(h==8);\
01701 \
/* left four columns: differences of rows 0-7 into mm0-mm7 */\
01702 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
01703 \
/* NOTE: this first asm declares sum as an output but never writes %0 */\
01704 asm volatile(\
01705 HADAMARD48\
01706 \
/* mm7 is needed as transpose scratch: park it at byte offset 96 (temp[12]) */\
01707 "movq %%mm7, 96(%1) \n\t"\
01708 \
01709 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
01710 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
01711 \
01712 "movq 96(%1), %%mm7 \n\t"\
01713 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
01714 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
01715 \
01716 : "=r" (sum)\
01717 : "r"(temp)\
01718 );\
01719 \
/* right four columns; temp[4] (byte offset 32) is still free as spill slot */\
01720 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
01721 \
01722 asm volatile(\
01723 HADAMARD48\
01724 \
01725 "movq %%mm7, 96(%1) \n\t"\
01726 \
01727 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
01728 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
01729 \
01730 "movq 96(%1), %%mm7 \n\t"\
01731 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
/* keep this transposed quarter in registers, reordered into mm4..mm7 */\
01732 "movq %%mm7, %%mm5 \n\t"\
01733 "movq %%mm6, %%mm7 \n\t"\
01734 "movq %%mm0, %%mm6 \n\t"\
01735 \
/* second pass over the rows stored at offset 64 plus the quarter in mm4-mm7 */\
01736 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
01737 \
01738 HADAMARD48\
/* mm7 doubles as MMABS scratch: spill it and fetch it back into mm2 later */\
01739 "movq %%mm7, 64(%1) \n\t"\
01740 MMABS(%%mm0, %%mm7)\
01741 MMABS(%%mm1, %%mm7)\
01742 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
01743 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
01744 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
01745 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
01746 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
01747 "movq 64(%1), %%mm2 \n\t"\
01748 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
01749 "paddusw %%mm1, %%mm0 \n\t"\
/* partial sum of this half is parked at offset 64 */\
01750 "movq %%mm0, 64(%1) \n\t"\
01751 \
/* second pass over the rows stored at offsets 0 and 32 */\
01752 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
01753 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
01754 \
01755 HADAMARD48\
01756 "movq %%mm7, (%1) \n\t"\
01757 MMABS(%%mm0, %%mm7)\
01758 MMABS(%%mm1, %%mm7)\
01759 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
01760 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
01761 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
01762 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
01763 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
01764 "movq (%1), %%mm2 \n\t"\
01765 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
/* add the parked partial sum, then reduce the four word lanes into sum */\
01766 "paddusw 64(%1), %%mm0 \n\t"\
01767 "paddusw %%mm1, %%mm0 \n\t"\
01768 \
01769 HSUM(%%mm0, %%mm1, %0)\
01770 \
01771 : "=r" (sum)\
01772 : "r"(temp)\
01773 );\
/* only the low word of the movd result is the total */\
01774 return sum&0xFFFF;\
01775 }\
01776 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
01777
/* HADAMARD8_DIFF_SSE2(cpu): define hadamard8_diff_<cpu>(), which returns the
   low 16 bits of the saturating sum of absolute values of the 8x8 Hadamard
   transform of src1 - src2, holding the whole 8x8 block of word differences
   in xmm0-xmm7. temp (32 bytes, 16-byte aligned) is the spill area handed to
   DIFF_PIXELS_8x8, TRANSPOSE8 and MMABS_SUM_8x8. MMABS and MMABS_SUM_8x8 must
   be #defined to the wanted variants before expansion; the final reduction
   always uses HSUM_SSE2. WARPER8_16_SQ is defined outside this section
   (presumably builds the 16x16 wrapper hadamard8_diff16_<cpu> -- verify). */
01778 #define HADAMARD8_DIFF_SSE2(cpu) \
01779 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
01780 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
01781 int sum;\
01782 \
01783 assert(h==8);\
01784 \
01785 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
01786 \
01787 asm volatile(\
01788 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
01789 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
/* register order differs here, presumably matching TRANSPOSE8's permuted output */\
01790 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
01791 MMABS_SUM_8x8\
01792 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
01793 : "=r" (sum)\
01794 : "r"(temp)\
01795 );\
/* only the low word of the movd result is the total */\
01796 return sum&0xFFFF;\
01797 }\
01798 WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
01799
/* Instantiate the Hadamard difference functions. Each group selects the
   MMABS / HSUM / MMABS_SUM_8x8 variants through temporary #defines. */

/* hadamard8_diff_mmx: plain MMX abs and horizontal sum */
01800 #define MMABS(a,z) MMABS_MMX(a,z)
01801 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
01802 HADAMARD8_DIFF_MMX(mmx)
01803 #undef MMABS
01804 #undef HSUM
01805
/* hadamard8_diff_mmx2 and hadamard8_diff_sse2 both use the pmaxsw-based abs */
01806 #define MMABS(a,z) MMABS_MMX2(a,z)
01807 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
01808 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
01809 HADAMARD8_DIFF_MMX(mmx2)
01810 HADAMARD8_DIFF_SSE2(sse2)
01811 #undef MMABS
01812 #undef MMABS_SUM_8x8
01813 #undef HSUM
01814
/* hadamard8_diff_ssse3: pabsw needs no scratch register, so the no-spill
   summation can be used (its xmm8/xmm9 arguments are never emitted) */
01815 #ifdef HAVE_SSSE3
01816 #define MMABS(a,z) MMABS_SSSE3(a,z)
01817 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
01818 HADAMARD8_DIFF_SSE2(ssse3)
01819 #undef MMABS
01820 #undef MMABS_SUM_8x8
01821 #endif
01822
/* DCT_SAD4(m,mm,o): load four registers (<mm>2..<mm>5) with "mov<m>" from byte
   offsets o, o+16, o+32 and o+48 of operand %1 and add the absolute values of
   their words to the two running sums <mm>0 and <mm>1 (saturating).
   <mm>6 and <mm>7 are the MMABS scratch registers. */
01823 #define DCT_SAD4(m,mm,o)\
01824 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
01825 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
01826 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
01827 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
01828 MMABS_SUM(mm##2, mm##6, mm##0)\
01829 MMABS_SUM(mm##3, mm##7, mm##1)\
01830 MMABS_SUM(mm##4, mm##6, mm##0)\
01831 MMABS_SUM(mm##5, mm##7, mm##1)\
01832
/* MMX body: four passes of 8-byte loads at offsets 0, 8, 64 and 72 cover
   128 bytes in total; the result is reduced into operand %0 with HSUM. */
01833 #define DCT_SAD_MMX\
01834 "pxor %%mm0, %%mm0 \n\t"\
01835 "pxor %%mm1, %%mm1 \n\t"\
01836 DCT_SAD4(q, %%mm, 0)\
01837 DCT_SAD4(q, %%mm, 8)\
01838 DCT_SAD4(q, %%mm, 64)\
01839 DCT_SAD4(q, %%mm, 72)\
01840 "paddusw %%mm1, %%mm0 \n\t"\
01841 HSUM(%%mm0, %%mm1, %0)
01842
/* SSE2 body: two passes of aligned 16-byte loads (movdqa) at offsets 0 and 64
   cover the same 128 bytes. */
01843 #define DCT_SAD_SSE2\
01844 "pxor %%xmm0, %%xmm0 \n\t"\
01845 "pxor %%xmm1, %%xmm1 \n\t"\
01846 DCT_SAD4(dqa, %%xmm, 0)\
01847 DCT_SAD4(dqa, %%xmm, 64)\
01848 "paddusw %%xmm1, %%xmm0 \n\t"\
01849 HSUM(%%xmm0, %%xmm1, %0)
01850
/* DCT_SAD_FUNC(cpu): define sum_abs_dctelem_<cpu>(block), which returns the
   low 16 bits of the saturating sum of absolute values of the words in the
   128 bytes starting at block. DCT_SAD, HSUM and MMABS must be #defined to
   the wanted variants before expansion. */
01851 #define DCT_SAD_FUNC(cpu) \
01852 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
01853 int sum;\
01854 asm volatile(\
01855 DCT_SAD\
01856 :"=r"(sum)\
01857 :"r"(block)\
01858 );\
01859 return sum&0xFFFF;\
01860 }
01861
/* Instantiate sum_abs_dctelem_{mmx,mmx2,sse2,ssse3}. The helper #defines are
   carried over between groups, so the order of the #define/#undef lines
   matters. */

/* sum_abs_dctelem_mmx: MMX body, MMX horizontal sum, MMX abs */
01862 #define DCT_SAD DCT_SAD_MMX
01863 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
01864 #define MMABS(a,z) MMABS_MMX(a,z)
01865 DCT_SAD_FUNC(mmx)
01866 #undef MMABS
01867 #undef HSUM
01868
/* sum_abs_dctelem_mmx2: still the MMX body (DCT_SAD unchanged), MMX2 helpers */
01869 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
01870 #define MMABS(a,z) MMABS_MMX2(a,z)
01871 DCT_SAD_FUNC(mmx2)
01872 #undef HSUM
01873 #undef DCT_SAD
01874
/* sum_abs_dctelem_sse2: SSE2 body and horizontal sum; MMABS is still the MMX2
   variant defined above */
01875 #define DCT_SAD DCT_SAD_SSE2
01876 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
01877 DCT_SAD_FUNC(sse2)
01878 #undef MMABS
01879
/* sum_abs_dctelem_ssse3: as SSE2 but with pabsw */
01880 #ifdef HAVE_SSSE3
01881 #define MMABS(a,z) MMABS_SSSE3(a,z)
01882 DCT_SAD_FUNC(ssse3)
01883 #undef MMABS
01884 #endif
01885 #undef HSUM
01886 #undef DCT_SAD
01887
/**
 * Sum of squared differences between a signed 8-bit array and a signed
 * 16-bit array.
 *
 * Elements are processed in groups of 8, starting from the end of the
 * arrays: the index starts at size, is reduced by 8 at the top of every
 * iteration, and the loop repeats while the reduced index is still greater
 * than zero.
 *
 * @param pix1 int8_t input
 * @param pix2 int16_t input
 * @param size number of elements; the loop steps in units of 8
 * @return low 32 bits of the accumulated sum of (pix2[i] - pix1[i])^2
 */
01888 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
01889 int sum;
01890 long i=size;
01891 asm volatile(
/* mm4 = two dword accumulators */
01892 "pxor %%mm4, %%mm4 \n"
01893 "1: \n"
/* sets the flags tested by "jg" below; none of the MMX instructions in
   between modify them */
01894 "sub $8, %0 \n"
01895 "movq (%2,%0), %%mm2 \n"
01896 "movq (%3,%0,2), %%mm0 \n"
01897 "movq 8(%3,%0,2), %%mm1 \n"
/* sign-extend the bytes to words: place each byte in the high half of a word
   (the low half is don't-care), then shift right arithmetically by 8 */
01898 "punpckhbw %%mm2, %%mm3 \n"
01899 "punpcklbw %%mm2, %%mm2 \n"
01900 "psraw $8, %%mm3 \n"
01901 "psraw $8, %%mm2 \n"
01902 "psubw %%mm3, %%mm1 \n"
01903 "psubw %%mm2, %%mm0 \n"
/* square the differences and add neighbouring pairs into dwords */
01904 "pmaddwd %%mm1, %%mm1 \n"
01905 "pmaddwd %%mm0, %%mm0 \n"
01906 "paddd %%mm1, %%mm4 \n"
01907 "paddd %%mm0, %%mm4 \n"
01908 "jg 1b \n"
/* fold the two dword accumulators into one */
01909 "movq %%mm4, %%mm3 \n"
01910 "psrlq $32, %%mm3 \n"
01911 "paddd %%mm3, %%mm4 \n"
01912 "movd %%mm4, %1 \n"
01913 :"+r"(i), "=r"(sum)
01914 :"r"(pix1), "r"(pix2)
01915 );
01916 return sum;
01917 }
01918
01919 #endif //CONFIG_ENCODERS
01920
/* The "no rounding" put functions are plain aliases of the regular ones
   (presumably because a straight copy involves no rounding -- verify). */
01921 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
01922 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
01923
/* QPEL_V_LOW: one output step of a vertical low-pass filter on four words.
   With the register/memory arguments as named, it computes
       (20*(m3+m4) - 6*(m5+in2) + 3*(m6+in1) - (in0+in7) + rnd) >> 5
   (the -6/+3 terms are formed as 3*((m6+in1) - 2*(m5+in2))), packs the result
   to unsigned bytes in mm5 and hands it to OP(%%mm5, out, %%mm7, d).
   m3 is overwritten with in7; mm4, mm5 and mm6 are clobbered.
   The pw_20 and pw_3 macro parameters are not referenced: the constants are
   taken from the globals ff_pw_20 and ff_pw_3 through MANGLE (defined
   outside this section). */
01924 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
01925 "paddw " #m4 ", " #m3 " \n\t" \
01926 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
01927 "pmullw " #m3 ", %%mm4 \n\t" \
01928 "movq "#in7", " #m3 " \n\t" \
01929 "movq "#in0", %%mm5 \n\t" \
01930 "paddw " #m3 ", %%mm5 \n\t" \
01931 "psubw %%mm5, %%mm4 \n\t" \
01932 "movq "#in1", %%mm5 \n\t" \
01933 "movq "#in2", %%mm6 \n\t" \
01934 "paddw " #m6 ", %%mm5 \n\t" \
01935 "paddw " #m5 ", %%mm6 \n\t" \
01936 "paddw %%mm6, %%mm6 \n\t" \
01937 "psubw %%mm6, %%mm5 \n\t" \
01938 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
01939 "paddw " #rnd ", %%mm4 \n\t" \
01940 "paddw %%mm4, %%mm5 \n\t" \
01941 "psraw $5, %%mm5 \n\t"\
01942 "packuswb %%mm5, %%mm5 \n\t"\
01943 OP(%%mm5, out, %%mm7, d)
01944
01945 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
01946 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01947 uint64_t temp;\
01948 \
01949 asm volatile(\
01950 "pxor %%mm7, %%mm7 \n\t"\
01951 "1: \n\t"\
01952 "movq (%0), %%mm0 \n\t" \
01953 "movq %%mm0, %%mm1 \n\t" \
01954 "movq %%mm0, %%mm2 \n\t" \
01955 "punpcklbw %%mm7, %%mm0 \n\t" \
01956 "punpckhbw %%mm7, %%mm1 \n\t" \
01957 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
01958 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
01959 "movq %%mm2, %%mm3 \n\t" \
01960 "movq %%mm2, %%mm4 \n\t" \
01961 "psllq $8, %%mm2 \n\t" \
01962 "psllq $16, %%mm3 \n\t" \
01963 "psllq $24, %%mm4 \n\t" \
01964 "punpckhbw %%mm7, %%mm2 \n\t" \
01965 "punpckhbw %%mm7, %%mm3 \n\t" \
01966 "punpckhbw %%mm7, %%mm4 \n\t" \
01967 "paddw %%mm3, %%mm5 \n\t" \
01968 "paddw %%mm2, %%mm6 \n\t" \
01969 "paddw %%mm5, %%mm5 \n\t" \
01970 "psubw %%mm5, %%mm6 \n\t" \
01971 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
01972 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
01973 "paddw %%mm4, %%mm0 \n\t" \
01974 "paddw %%mm1, %%mm5 \n\t" \
01975 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01976 "psubw %%mm5, %%mm0 \n\t" \
01977 "paddw %6, %%mm6 \n\t"\
01978 "paddw %%mm6, %%mm0 \n\t" \
01979 "psraw $5, %%mm0 \n\t"\
01980 "movq %%mm0, %5 \n\t"\
01981 \
01982 \
01983 "movq 5(%0), %%mm0 \n\t" \
01984 "movq %%mm0, %%mm5 \n\t" \
01985 "movq %%mm0, %%mm6 \n\t" \
01986 "psrlq $8, %%mm0 \n\t" \
01987 "psrlq $16, %%mm5 \n\t" \
01988 "punpcklbw %%mm7, %%mm0 \n\t" \
01989 "punpcklbw %%mm7, %%mm5 \n\t" \
01990 "paddw %%mm0, %%mm2 \n\t" \
01991 "paddw %%mm5, %%mm3 \n\t" \
01992 "paddw %%mm2, %%mm2 \n\t" \
01993 "psubw %%mm2, %%mm3 \n\t" \
01994 "movq %%mm6, %%mm2 \n\t" \
01995 "psrlq $24, %%mm6 \n\t" \
01996 "punpcklbw %%mm7, %%mm2 \n\t" \
01997 "punpcklbw %%mm7, %%mm6 \n\t" \
01998 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01999 "paddw %%mm2, %%mm1 \n\t" \
02000 "paddw %%mm6, %%mm4 \n\t" \
02001 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
02002 "psubw %%mm4, %%mm3 \n\t" \
02003 "paddw %6, %%mm1 \n\t"\
02004 "paddw %%mm1, %%mm3 \n\t" \
02005 "psraw $5, %%mm3 \n\t"\
02006 "movq %5, %%mm1 \n\t"\
02007 "packuswb %%mm3, %%mm1 \n\t"\
02008 OP_MMX2(%%mm1, (%1),%%mm4, q)\
02009 \
02010 \
02011 "movq 9(%0), %%mm1 \n\t" \
02012 "movq %%mm1, %%mm4 \n\t" \
02013 "movq %%mm1, %%mm3 \n\t" \
02014 "psrlq $8, %%mm1 \n\t" \
02015 "psrlq $16, %%mm4 \n\t" \
02016 "punpcklbw %%mm7, %%mm1 \n\t" \
02017 "punpcklbw %%mm7, %%mm4 \n\t" \
02018 "paddw %%mm1, %%mm5 \n\t" \
02019 "paddw %%mm4, %%mm0 \n\t" \
02020 "paddw %%mm5, %%mm5 \n\t" \
02021 "psubw %%mm5, %%mm0 \n\t" \
02022 "movq %%mm3, %%mm5 \n\t" \
02023 "psrlq $24, %%mm3 \n\t" \
02024 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
02025 "punpcklbw %%mm7, %%mm3 \n\t" \
02026 "paddw %%mm3, %%mm2 \n\t" \
02027 "psubw %%mm2, %%mm0 \n\t" \
02028 "movq %%mm5, %%mm2 \n\t" \
02029 "punpcklbw %%mm7, %%mm2 \n\t" \
02030 "punpckhbw %%mm7, %%mm5 \n\t" \
02031 "paddw %%mm2, %%mm6 \n\t" \
02032 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
02033 "paddw %6, %%mm0 \n\t"\
02034 "paddw %%mm6, %%mm0 \n\t" \
02035 "psraw $5, %%mm0 \n\t"\
02036 \
02037 \
02038 "paddw %%mm5, %%mm3 \n\t" \
02039 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
02040 "paddw %%mm4, %%mm6 \n\t" \
02041 "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
02042 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
02043 "paddw %%mm1, %%mm4 \n\t" \
02044 "paddw %%mm2, %%mm5 \n\t" \
02045 "paddw %%mm6, %%mm6 \n\t" \
02046 "psubw %%mm6, %%mm4 \n\t" \
02047 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
02048 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
02049 "psubw %%mm5, %%mm3 \n\t" \
02050 "paddw %6, %%mm4 \n\t"\
02051 "paddw %%mm3, %%mm4 \n\t" \
02052 "psraw $5, %%mm4 \n\t"\
02053 "packuswb %%mm4, %%mm0 \n\t"\
02054 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
02055 \
02056 "add %3, %0 \n\t"\
02057 "add %4, %1 \n\t"\
02058 "decl %2 \n\t"\
02059 " jnz 1b \n\t"\
02060 : "+a"(src), "+c"(dst), "+m"(h)\
02061 : "d"((long)srcStride), "S"((long)dstStride), "m"(temp), "m"(ROUNDER)\
02062 : "memory"\
02063 );\
02064 }\
02065 \
02066 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
02067 int i;\
02068 int16_t temp[16];\
02069 \
02070 for(i=0; i<h; i++)\
02071 {\
02072 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
02073 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
02074 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
02075 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
02076 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
02077 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
02078 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
02079 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
02080 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
02081 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
02082 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
02083 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
02084 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
02085 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
02086 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
02087 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
02088 asm volatile(\
02089 "movq (%0), %%mm0 \n\t"\
02090 "movq 8(%0), %%mm1 \n\t"\
02091 "paddw %2, %%mm0 \n\t"\
02092 "paddw %2, %%mm1 \n\t"\
02093 "psraw $5, %%mm0 \n\t"\
02094 "psraw $5, %%mm1 \n\t"\
02095 "packuswb %%mm1, %%mm0 \n\t"\
02096 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
02097 "movq 16(%0), %%mm0 \n\t"\
02098 "movq 24(%0), %%mm1 \n\t"\
02099 "paddw %2, %%mm0 \n\t"\
02100 "paddw %2, %%mm1 \n\t"\
02101 "psraw $5, %%mm0 \n\t"\
02102 "psraw $5, %%mm1 \n\t"\
02103 "packuswb %%mm1, %%mm0 \n\t"\
02104 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
02105 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
02106 : "memory"\
02107 );\
02108 dst+=dstStride;\
02109 src+=srcStride;\
02110 }\
02111 }\
02112 \
02113 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
02114 uint64_t temp; /* only handed to the asm as operand %5; the asm text below never references %5 */\
02115 /* Filters h rows horizontally: each loop iteration reads one row at src and hands 8 packed bytes to OP_MMX2 for dst. */\
02116 asm volatile(\
02117 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, zero source for the byte->word unpacks */\
02118 "1: \n\t" /* loop head: one iteration per row */\
02119 "movq (%0), %%mm0 \n\t" /* load 8 bytes starting at src[0] */\
02120 "movq %%mm0, %%mm1 \n\t" \
02121 "movq %%mm0, %%mm2 \n\t" \
02122 "punpcklbw %%mm7, %%mm0 \n\t" /* low 4 bytes -> words */\
02123 "punpckhbw %%mm7, %%mm1 \n\t" /* high 4 bytes -> words */\
02124 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
02125 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
02126 "movq %%mm2, %%mm3 \n\t" \
02127 "movq %%mm2, %%mm4 \n\t" \
02128 "psllq $8, %%mm2 \n\t" /* byte-shifted copies of the row (by 1, 2 and 3 bytes) */\
02129 "psllq $16, %%mm3 \n\t" \
02130 "psllq $24, %%mm4 \n\t" \
02131 "punpckhbw %%mm7, %%mm2 \n\t" \
02132 "punpckhbw %%mm7, %%mm3 \n\t" \
02133 "punpckhbw %%mm7, %%mm4 \n\t" \
02134 "paddw %%mm3, %%mm5 \n\t" \
02135 "paddw %%mm2, %%mm6 \n\t" \
02136 "paddw %%mm5, %%mm5 \n\t" /* double mm5 before it is subtracted */\
02137 "psubw %%mm5, %%mm6 \n\t" \
02138 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
02139 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* scale by 3 per word */\
02140 "paddw %%mm4, %%mm0 \n\t" \
02141 "paddw %%mm1, %%mm5 \n\t" \
02142 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* scale by 20 per word */\
02143 "psubw %%mm5, %%mm0 \n\t" \
02144 "paddw %6, %%mm6 \n\t" /* add ROUNDER */\
02145 "paddw %%mm6, %%mm0 \n\t" \
02146 "psraw $5, %%mm0 \n\t" /* mm0 = first four results as words */\
02147 /* The second group of four results is accumulated in mm3 below. */\
02148 \
02149 "movd 5(%0), %%mm5 \n\t" /* load 4 more bytes starting at src[5] */\
02150 "punpcklbw %%mm7, %%mm5 \n\t" \
02151 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
02152 "paddw %%mm5, %%mm1 \n\t" \
02153 "paddw %%mm6, %%mm2 \n\t" \
02154 "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
02155 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
02156 "paddw %%mm6, %%mm3 \n\t" \
02157 "paddw %%mm5, %%mm4 \n\t" \
02158 "paddw %%mm2, %%mm2 \n\t" \
02159 "psubw %%mm2, %%mm3 \n\t" \
02160 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
02161 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
02162 "psubw %%mm4, %%mm3 \n\t" \
02163 "paddw %6, %%mm1 \n\t" /* add ROUNDER */\
02164 "paddw %%mm1, %%mm3 \n\t" \
02165 "psraw $5, %%mm3 \n\t"\
02166 "packuswb %%mm3, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
02167 OP_MMX2(%%mm0, (%1), %%mm4, q)\
02168 /* Advance to the next row and loop until h reaches zero. */\
02169 "add %3, %0 \n\t" /* src += srcStride */\
02170 "add %4, %1 \n\t" /* dst += dstStride */\
02171 "decl %2 \n\t" /* h-- */\
02172 " jnz 1b \n\t"\
02173 : "+a"(src), "+c"(dst), "+m"(h) /* %0, %1, %2 */\
02174 : "S"((long)srcStride), "D"((long)dstStride), "m"(temp), "m"(ROUNDER) /* %3, %4, %5, %6 */\
02175 : "memory"\
02176 );\
02177 }\
02178 \
02179 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
02180 int i;\
02181 int16_t temp[8]; /* the 8 filter sums of one row, before rounding and shifting */\
02182 /* Per row: the sums are computed in C from src[0..8]; the asm then adds ROUNDER, shifts right by 5, packs to bytes and hands them to OP_3DNOW for dst. */\
02183 for(i=0; i<h; i++)\
02184 {\
02185 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]); /* weights 20, -6, 3, -1 on four pixel pairs */\
02186 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]); /* all indices stay within 0..8; near the ends this looks like edge mirroring */\
02187 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
02188 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
02189 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
02190 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
02191 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
02192 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
02193 asm volatile(\
02194 "movq (%0), %%mm0 \n\t" /* temp[0..3] */\
02195 "movq 8(%0), %%mm1 \n\t" /* temp[4..7] */\
02196 "paddw %2, %%mm0 \n\t" /* add ROUNDER */\
02197 "paddw %2, %%mm1 \n\t"\
02198 "psraw $5, %%mm0 \n\t"\
02199 "psraw $5, %%mm1 \n\t"\
02200 "packuswb %%mm1, %%mm0 \n\t" /* 8 words -> 8 bytes with unsigned saturation */\
02201 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
02202 :: "r"(temp), "r"(dst), "m"(ROUNDER) /* %0, %1, %2 */\
02203 :"memory"\
02204 );\
02205 dst+=dstStride;\
02206 src+=srcStride;\
02207 }\
02208 }
02209
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX): expands to the 16- and 8-wide
 * vertical lowpass filters and to the qpel8 / qpel16 mcXY entry points for
 * one OPNAME / MMX combination.
 *   OPNAME  - name prefix of the generated functions and of the helpers that
 *             perform the final write to dst.
 *   ROUNDER - constant passed to the asm as a memory operand and forwarded
 *             to QPEL_V_LOW.
 *   RND     - pasted after "put" to pick the helpers used for intermediate
 *             buffers.
 *   OP      - store macro forwarded to QPEL_V_LOW.
 *   MMX     - name suffix of the generated functions and of the helpers.
 * The h_lowpass and pixelsN_l2 helpers and QPEL_V_LOW are defined outside
 * this macro. In the per-function notes, "l2(a, b)" stands for a call to the
 * matching pixelsN_l2 helper with sources a and b.
 */
02210 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
02211 \
02212 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02213 uint64_t temp[17*4]; /* 4 column groups, each 17 rows of one qword (4 words) */\
02214 uint64_t *temp_ptr= temp;\
02215 int count= 17;\
02216 /* Pass 1: unpack 17 rows of 16 bytes to words; column group k of a row is stored at byte offset k*17*8 from that row's slot. */\
02217 \
02218 asm volatile(\
02219 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 for the unpacks */\
02220 "1: \n\t"\
02221 "movq (%0), %%mm0 \n\t"\
02222 "movq (%0), %%mm1 \n\t"\
02223 "movq 8(%0), %%mm2 \n\t"\
02224 "movq 8(%0), %%mm3 \n\t"\
02225 "punpcklbw %%mm7, %%mm0 \n\t"\
02226 "punpckhbw %%mm7, %%mm1 \n\t"\
02227 "punpcklbw %%mm7, %%mm2 \n\t"\
02228 "punpckhbw %%mm7, %%mm3 \n\t"\
02229 "movq %%mm0, (%1) \n\t"\
02230 "movq %%mm1, 17*8(%1) \n\t"\
02231 "movq %%mm2, 2*17*8(%1) \n\t"\
02232 "movq %%mm3, 3*17*8(%1) \n\t"\
02233 "add $8, %1 \n\t" /* next row slot in temp */\
02234 "add %3, %0 \n\t" /* src += srcStride */\
02235 "decl %2 \n\t"\
02236 " jnz 1b \n\t"\
02237 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
02238 : "r" ((long)srcStride)\
02239 : "memory"\
02240 );\
02241 \
02242 temp_ptr= temp;\
02243 count=4;\
02244 /* Pass 2: one iteration per column group; 16 QPEL_V_LOW invocations alternate between (%1) and (%1, %3) as destination. */\
02245 /* Operands: %0 temp_ptr, %1 dst, %2 count, %3 dstStride, %4 2*dstStride, %5 ROUNDER, %6 4-14*dstStride. How QPEL_V_LOW uses its %5/%6 arguments is not visible here. */\
02246 asm volatile(\
02247 \
02248 "1: \n\t"\
02249 "movq (%0), %%mm0 \n\t"\
02250 "movq 8(%0), %%mm1 \n\t"\
02251 "movq 16(%0), %%mm2 \n\t"\
02252 "movq 24(%0), %%mm3 \n\t"\
02253 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
02254 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
02255 "add %4, %1 \n\t" /* dst += 2*dstStride */\
02256 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
02257 \
02258 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
02259 "add %4, %1 \n\t"\
02260 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
02261 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
02262 "add %4, %1 \n\t"\
02263 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
02264 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
02265 "add %4, %1 \n\t"\
02266 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
02267 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
02268 "add %4, %1 \n\t"\
02269 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
02270 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
02271 "add %4, %1 \n\t"\
02272 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
02273 /* The last three invocations reuse offsets 128, 120 and 112 instead of going past row 16 of the group. */\
02274 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
02275 "add %4, %1 \n\t" \
02276 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
02277 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
02278 \
02279 "add $136, %0 \n\t" /* next column group in temp: 17 rows * 8 bytes */\
02280 "add %6, %1 \n\t" /* dst += 4 - 14*dstStride: undo the seven 2-row steps, move 4 bytes right */\
02281 "decl %2 \n\t"\
02282 " jnz 1b \n\t"\
02283 \
02284 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
02285 : "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ROUNDER), "g"(4-14*(long)dstStride)\
02286 :"memory"\
02287 );\
02288 }\
02289 \
02290 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02291 uint64_t temp[9*2]; /* 2 column groups, each 9 rows of one qword (4 words) */\
02292 uint64_t *temp_ptr= temp;\
02293 int count= 9;\
02294 /* Pass 1: unpack 9 rows of 8 bytes to words; the high half of each row goes 9*8 bytes after the low half. */\
02295 \
02296 asm volatile(\
02297 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 for the unpacks */\
02298 "1: \n\t"\
02299 "movq (%0), %%mm0 \n\t"\
02300 "movq (%0), %%mm1 \n\t"\
02301 "punpcklbw %%mm7, %%mm0 \n\t"\
02302 "punpckhbw %%mm7, %%mm1 \n\t"\
02303 "movq %%mm0, (%1) \n\t"\
02304 "movq %%mm1, 9*8(%1) \n\t"\
02305 "add $8, %1 \n\t" /* next row slot in temp */\
02306 "add %3, %0 \n\t" /* src += srcStride */\
02307 "decl %2 \n\t"\
02308 " jnz 1b \n\t"\
02309 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
02310 : "r" ((long)srcStride)\
02311 : "memory"\
02312 );\
02313 \
02314 temp_ptr= temp;\
02315 count=2;\
02316 /* Pass 2: one iteration per column group; 8 QPEL_V_LOW invocations alternate between (%1) and (%1, %3) as destination. */\
02317 /* Operands: %0 temp_ptr, %1 dst, %2 count, %3 dstStride, %4 2*dstStride, %5 ROUNDER, %6 4-6*dstStride. */\
02318 asm volatile(\
02319 \
02320 "1: \n\t"\
02321 "movq (%0), %%mm0 \n\t"\
02322 "movq 8(%0), %%mm1 \n\t"\
02323 "movq 16(%0), %%mm2 \n\t"\
02324 "movq 24(%0), %%mm3 \n\t"\
02325 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
02326 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
02327 "add %4, %1 \n\t" /* dst += 2*dstStride */\
02328 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
02329 \
02330 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
02331 "add %4, %1 \n\t"\
02332 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
02333 /* The last three invocations reuse offsets 64, 56 and 48 instead of going past row 8 of the group. */\
02334 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
02335 "add %4, %1 \n\t"\
02336 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
02337 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
02338 \
02339 "add $72, %0 \n\t" /* next column group in temp: 9 rows * 8 bytes */\
02340 "add %6, %1 \n\t" /* dst += 4 - 6*dstStride: undo the three 2-row steps, move 4 bytes right */\
02341 "decl %2 \n\t"\
02342 " jnz 1b \n\t"\
02343 \
02344 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
02345 : "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ROUNDER), "g"(4-6*(long)dstStride)\
02346 : "memory"\
02347 );\
02348 }\
02349 \
02350 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){ /* forwards to OPNAME pixels8_mmx with h = 8 */\
02351 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
02352 }\
02353 \
02354 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* half = h_lowpass(src); dst = l2(src, half) */\
02355 uint64_t temp[8]; /* 8 rows of 8 bytes */\
02356 uint8_t * const half= (uint8_t*)temp;\
02357 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
02358 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
02359 }\
02360 \
02361 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* h_lowpass written straight to dst */\
02362 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
02363 }\
02364 \
02365 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc10, but the l2 step reads src+1 */\
02366 uint64_t temp[8];\
02367 uint8_t * const half= (uint8_t*)temp;\
02368 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
02369 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
02370 }\
02371 \
02372 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* half = v_lowpass(src); dst = l2(src, half) */\
02373 uint64_t temp[8];\
02374 uint8_t * const half= (uint8_t*)temp;\
02375 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
02376 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
02377 }\
02378 \
02379 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* v_lowpass written straight to dst */\
02380 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
02381 }\
02382 \
02383 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc01, but the l2 step reads src+stride */\
02384 uint64_t temp[8];\
02385 uint8_t * const half= (uint8_t*)temp;\
02386 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
02387 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
02388 }\
02389 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* halfH = l2(src, h_lowpass(src)) over 9 rows; halfHV = v_lowpass(halfH); dst = l2(halfH, halfHV) */\
02390 uint64_t half[8 + 9]; /* 8 qwords for halfHV, then 9 qwords (9 rows of 8 bytes) for halfH */\
02391 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02392 uint8_t * const halfHV= ((uint8_t*)half);\
02393 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02394 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02395 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02396 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02397 }\
02398 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc11, but the first l2 step reads src+1 */\
02399 uint64_t half[8 + 9];\
02400 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02401 uint8_t * const halfHV= ((uint8_t*)half);\
02402 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02403 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02404 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02405 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02406 }\
02407 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc11, but the final l2 step reads halfH+8 (one row further down) */\
02408 uint64_t half[8 + 9];\
02409 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02410 uint8_t * const halfHV= ((uint8_t*)half);\
02411 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02412 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02413 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02414 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02415 }\
02416 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc11, with src+1 in the first l2 step and halfH+8 in the final one */\
02417 uint64_t half[8 + 9];\
02418 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02419 uint8_t * const halfHV= ((uint8_t*)half);\
02420 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02421 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02422 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02423 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02424 }\
02425 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* halfH = h_lowpass(src) over 9 rows; halfHV = v_lowpass(halfH); dst = l2(halfH, halfHV) */\
02426 uint64_t half[8 + 9];\
02427 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02428 uint8_t * const halfHV= ((uint8_t*)half);\
02429 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02430 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02431 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02432 }\
02433 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc21, but the final l2 step reads halfH+8 */\
02434 uint64_t half[8 + 9];\
02435 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02436 uint8_t * const halfHV= ((uint8_t*)half);\
02437 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02438 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02439 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02440 }\
02441 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* halfH = l2(src, h_lowpass(src)) over 9 rows; dst = v_lowpass(halfH) */\
02442 uint64_t half[8 + 9]; /* only the first 9 qwords are addressed through halfH here */\
02443 uint8_t * const halfH= ((uint8_t*)half);\
02444 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02445 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02446 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02447 }\
02448 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc12, but the l2 step reads src+1 */\
02449 uint64_t half[8 + 9];\
02450 uint8_t * const halfH= ((uint8_t*)half);\
02451 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02452 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02453 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02454 }\
02455 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* halfH = h_lowpass(src) over 9 rows; dst = v_lowpass(halfH) */\
02456 uint64_t half[9];\
02457 uint8_t * const halfH= ((uint8_t*)half);\
02458 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02459 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02460 }\
02461 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){ /* forwards to OPNAME pixels16_mmx with h = 16 */\
02462 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
02463 }\
02464 \
02465 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* half = h_lowpass(src); dst = l2(src, half) */\
02466 uint64_t temp[32]; /* 16 rows of 16 bytes */\
02467 uint8_t * const half= (uint8_t*)temp;\
02468 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
02469 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
02470 }\
02471 \
02472 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* h_lowpass written straight to dst */\
02473 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
02474 }\
02475 \
02476 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc10, but the l2 step reads src+1 */\
02477 uint64_t temp[32];\
02478 uint8_t * const half= (uint8_t*)temp;\
02479 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
02480 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
02481 }\
02482 \
02483 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* half = v_lowpass(src); dst = l2(src, half) */\
02484 uint64_t temp[32];\
02485 uint8_t * const half= (uint8_t*)temp;\
02486 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
02487 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
02488 }\
02489 \
02490 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* v_lowpass written straight to dst */\
02491 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
02492 }\
02493 \
02494 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc01, but the l2 step reads src+stride */\
02495 uint64_t temp[32];\
02496 uint8_t * const half= (uint8_t*)temp;\
02497 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
02498 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
02499 }\
02500 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* halfH = l2(src, h_lowpass(src)) over 17 rows; halfHV = v_lowpass(halfH); dst = l2(halfH, halfHV) */\
02501 uint64_t half[16*2 + 17*2]; /* 32 qwords for halfHV, then 34 qwords (17 rows of 16 bytes) for halfH */\
02502 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02503 uint8_t * const halfHV= ((uint8_t*)half);\
02504 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02505 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02506 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02507 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02508 }\
02509 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc11, but the first l2 step reads src+1 */\
02510 uint64_t half[16*2 + 17*2];\
02511 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02512 uint8_t * const halfHV= ((uint8_t*)half);\
02513 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02514 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02515 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02516 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02517 }\
02518 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc11, but the final l2 step reads halfH+16 (one row further down) */\
02519 uint64_t half[16*2 + 17*2];\
02520 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02521 uint8_t * const halfHV= ((uint8_t*)half);\
02522 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02523 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02524 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02525 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02526 }\
02527 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc11, with src+1 in the first l2 step and halfH+16 in the final one */\
02528 uint64_t half[16*2 + 17*2];\
02529 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02530 uint8_t * const halfHV= ((uint8_t*)half);\
02531 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02532 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02533 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02534 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02535 }\
02536 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* halfH = h_lowpass(src) over 17 rows; halfHV = v_lowpass(halfH); dst = l2(halfH, halfHV) */\
02537 uint64_t half[16*2 + 17*2];\
02538 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02539 uint8_t * const halfHV= ((uint8_t*)half);\
02540 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02541 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02542 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02543 }\
02544 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc21, but the final l2 step reads halfH+16 */\
02545 uint64_t half[16*2 + 17*2];\
02546 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02547 uint8_t * const halfHV= ((uint8_t*)half);\
02548 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02549 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02550 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02551 }\
02552 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* halfH = l2(src, h_lowpass(src)) over 17 rows; dst = v_lowpass(halfH) */\
02553 uint64_t half[17*2]; /* 17 rows of 16 bytes */\
02554 uint8_t * const halfH= ((uint8_t*)half);\
02555 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02556 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02557 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02558 }\
02559 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* as mc12, but the l2 step reads src+1 */\
02560 uint64_t half[17*2];\
02561 uint8_t * const halfH= ((uint8_t*)half);\
02562 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02563 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02564 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02565 }\
02566 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* halfH = h_lowpass(src) over 17 rows; dst = v_lowpass(halfH) */\
02567 uint64_t half[17*2];\
02568 uint8_t * const halfH= ((uint8_t*)half);\
02569 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02570 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02571 }
02572
/* Store op: emits a single "mov<size> a, b"; the temp argument is accepted but unused. */
02573 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* Averaging store op (3DNow!): loads b into temp, applies pavgusb temp -> a, then stores a back to b. */
02574 #define AVG_3DNOW_OP(a,b,temp, size) \
02575 "mov" #size " " #b ", " #temp " \n\t"\
02576 "pavgusb " #temp ", " #a " \n\t"\
02577 "mov" #size " " #a ", " #b " \n\t"
/* Averaging store op (MMX2): loads b into temp, applies pavgb temp -> a, then stores a back to b. */
02578 #define AVG_MMX2_OP(a,b,temp, size) \
02579 "mov" #size " " #b ", " #temp " \n\t"\
02580 "pavgb " #temp ", " #a " \n\t"\
02581 "mov" #size " " #a ", " #b " \n\t"
02582
/*
 * Instantiate the qpel code for put_, avg_ and put_no_rnd_.
 * QPEL_BASE receives two store macros per OPNAME (presumably its OP_MMX2 and
 * OP_3DNOW parameters, judging by the argument names). QPEL_OP is expanded
 * once per OPNAME and per instruction-set suffix (3dnow, mmx2).
 * The put_no_rnd_ variants use ff_pw_15 as ROUNDER instead of ff_pw_16.
 * The avg_ variants pass "_" as RND, so their intermediate buffers are
 * produced by the put_ helpers.
 */
02583 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
02584 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
02585 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
02586 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
02587 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
02588 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
02589 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
02590 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
02591 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
02592
02593
02594
02595
/*
 * Defines OPNAME 2tap_qpel<SIZE>_mc<XY>_<MMX> as a forwarder to
 * OPNAME pixels<SIZE><HPEL>(dst, src, stride, SIZE).
 */
02596 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
02597 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02598 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
02599 }
/*
 * Defines OPNAME 2tap_qpel<SIZE>_mc<XY>_<MMX> as a forwarder to
 * OPNAME 2tap_qpel<SIZE>_l3_<MMX>(dst, src+S0, stride, SIZE, S1, S2).
 * S0, S1 and S2 are expanded inside the generated function, so they may
 * refer to its stride parameter.
 */
02600 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
02601 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02602 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
02603 }
02604
/*
 * Generates the 2tap_qpel<SIZE>_mcXY_<MMX> entry points for one
 * OPNAME / SIZE / MMX combination:
 *   mc20, mc02, mc22     - forwarders to pixels<SIZE>_x2 / _y2 / _xy2;
 *                          the xy2 one always uses the _mmx suffix.
 *   mc00, mc21, mc12     - function-pointer constants aliasing qpel mc00,
 *                          2tap mc20 and 2tap mc02 respectively.
 *   mc32, mc23           - _y2 at src+1 and _x2 at src+stride.
 *   remaining positions  - forwarders to the _l3_ helper via QPEL_2TAP_L3.
 */
02605 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
02606 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
02607 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
02608 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
02609 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
02610 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
02611 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
02612 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
02613 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
02614 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
02615 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02616 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
02617 }\
02618 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02619 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
02620 }\
02621 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
02622 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
02623 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
02624 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
02625 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
02626 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
02627 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
02628 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
02629
/* Instantiate the 2-tap qpel entry points: put_/avg_, sizes 16 and 8, for mmx2 and 3dnow. */
02630 QPEL_2TAP(put_, 16, mmx2)
02631 QPEL_2TAP(avg_, 16, mmx2)
02632 QPEL_2TAP(put_, 8, mmx2)
02633 QPEL_2TAP(avg_, 8, mmx2)
02634 QPEL_2TAP(put_, 16, 3dnow)
02635 QPEL_2TAP(avg_, 16, 3dnow)
02636 QPEL_2TAP(put_, 8, 3dnow)
02637 QPEL_2TAP(avg_, 8, 3dnow)
02638
02639
/* Disabled by "#if 0" and therefore never compiled: an empty function that returns immediately. */
02640 #if 0
02641 static void just_return() { return; }
02642 #endif
02643
/*
 * Assigns the put_, put_no_rnd_ and avg_ variants of function postfix2 to the
 * corresponding c-> members named by postfix1. Expands to three complete
 * statements and requires a pointer named c at the expansion site.
 */
02644 #define SET_QPEL_FUNC(postfix1, postfix2) \
02645 c->put_ ## postfix1 = put_ ## postfix2;\
02646 c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
02647 c->avg_ ## postfix1 = avg_ ## postfix2;
02648
/*
 * Writes a block w = 8 pixels wide and h rows high to dst. Each output pixel
 * is a weighted sum of the four source pixels src[0], src[1], src[stride] and
 * src[stride+1], with weights built from per-pixel fractional offsets, plus r,
 * shifted right by 2*shift.
 *
 * ox, oy             - starting offsets; ox>>(16+shift) and oy>>(16+shift)
 *                      give the integer source position (ix, iy).
 * dxx, dyx           - offset increments per column.
 * dxy, dyy           - offset increments per row.
 * shift              - the weights are built from 1<<shift.
 * r                  - constant added before the final shift.
 * width, height      - bounds used to decide whether the source must be
 *                      fetched through ff_emulated_edge_mc.
 *
 * Falls back to ff_gmc_c when the corner checks below find bits set at or
 * above bit 16+shift, or when any increment has one of its low 4 bits set.
 *
 * mm6 and mm7 are loaded once and relied upon across the later, separate asm
 * statements; mm0-mm5 are likewise carried from one asm statement to the next.
 */
02649 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
02650 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
02651 const int w = 8;
02652 const int ix = ox>>(16+shift);
02653 const int iy = oy>>(16+shift);
/* Offsets and increments reduced by 4 bits; only their low 16 bits are kept in the word arrays below. */
02654 const int oxs = ox>>4;
02655 const int oys = oy>>4;
02656 const int dxxs = dxx>>4;
02657 const int dxys = dxy>>4;
02658 const int dyxs = dyx>>4;
02659 const int dyys = dyy>>4;
02660 const uint16_t r4[4] = {r,r,r,r};
02661 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
02662 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
02663 const uint64_t shift2 = 2*shift;
/* Variable-length scratch buffer for the edge-emulated source, (h+1) rows of stride bytes. */
02664 uint8_t edge_buf[(h+1)*stride];
02665 int x, y;
02666
/* Offset drift across the block relative to a unit step per pixel (1<<(16+shift)). */
02667 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
02668 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
02669 const int dxh = dxy*(h-1);
02670 const int dyw = dyx*(w-1);
02671 if(
/* Any bit at or above 16+shift differing between the start offset and a drifted corner offset... */
02672 (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
02673 oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
02674
/* ...or an increment whose low 4 bits would be lost by the >>4 above. */
02675 || (dxx|dxy|dyx|dyy)&15 )
02676 {
02677
02678 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
02679 return;
02680 }
02681
02682 src += ix + iy*stride;
/* The unsigned casts make negative ix/iy fail the range test as well. */
02683 if( (unsigned)ix >= width-w ||
02684 (unsigned)iy >= height-h )
02685 {
02686 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
02687 src = edge_buf;
02688 }
02689
/* mm6 = 1<<shift replicated into all four words; mm7 = 0. */
02690 asm volatile(
02691 "movd %0, %%mm6 \n\t"
02692 "pxor %%mm7, %%mm7 \n\t"
02693 "punpcklwd %%mm6, %%mm6 \n\t"
02694 "punpcklwd %%mm6, %%mm6 \n\t"
02695 :: "r"(1<<shift)
02696 );
02697
/* Four columns per outer iteration. dx4/dy4 start one row step early because the inner loop adds dxy4/dyy4 before use. */
02698 for(x=0; x<w; x+=4){
02699 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
02700 oxs - dxys + dxxs*(x+1),
02701 oxs - dxys + dxxs*(x+2),
02702 oxs - dxys + dxxs*(x+3) };
02703 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
02704 oys - dyys + dyxs*(x+1),
02705 oys - dyys + dyxs*(x+2),
02706 oys - dyys + dyxs*(x+3) };
02707
02708 for(y=0; y<h; y++){
/* Step the accumulators by one row, write them back, and keep the top 4 bits of each word in mm4 (x) and mm5 (y). */
02709 asm volatile(
02710 "movq %0, %%mm4 \n\t"
02711 "movq %1, %%mm5 \n\t"
02712 "paddw %2, %%mm4 \n\t"
02713 "paddw %3, %%mm5 \n\t"
02714 "movq %%mm4, %0 \n\t"
02715 "movq %%mm5, %1 \n\t"
02716 "psrlw $12, %%mm4 \n\t"
02717 "psrlw $12, %%mm5 \n\t"
02718 : "+m"(*dx4), "+m"(*dy4)
02719 : "m"(*dxy4), "m"(*dyy4)
02720 );
02721
/*
 * With s = mm6, dx = mm4, dy = mm5, build the four weights
 *   mm0 = (s-dx)*(s-dy), mm1 = dx*(s-dy), mm2 = (s-dx)*dy, mm3 = dx*dy
 * and multiply the lower-row pair: mm3 by src[stride+1], mm2 by src[stride].
 */
02722 asm volatile(
02723 "movq %%mm6, %%mm2 \n\t"
02724 "movq %%mm6, %%mm1 \n\t"
02725 "psubw %%mm4, %%mm2 \n\t"
02726 "psubw %%mm5, %%mm1 \n\t"
02727 "movq %%mm2, %%mm0 \n\t"
02728 "movq %%mm4, %%mm3 \n\t"
02729 "pmullw %%mm1, %%mm0 \n\t"
02730 "pmullw %%mm5, %%mm3 \n\t"
02731 "pmullw %%mm5, %%mm2 \n\t"
02732 "pmullw %%mm4, %%mm1 \n\t"
02733
02734 "movd %1, %%mm5 \n\t"
02735 "movd %0, %%mm4 \n\t"
02736 "punpcklbw %%mm7, %%mm5 \n\t"
02737 "punpcklbw %%mm7, %%mm4 \n\t"
02738 "pmullw %%mm5, %%mm3 \n\t"
02739 "pmullw %%mm4, %%mm2 \n\t"
02740
02741 :
02742 : "m"(src[stride]), "m"(src[stride+1])
02743 );
02744
/*
 * Multiply the upper-row pair (mm1 by src[1], mm0 by src[0]), add r4 and the
 * four products, shift right by shift2, pack with unsigned saturation and
 * store 4 bytes at dst[x+y*stride].
 */
02745 asm volatile(
02746 "movd %2, %%mm5 \n\t"
02747 "movd %1, %%mm4 \n\t"
02748 "punpcklbw %%mm7, %%mm5 \n\t"
02749 "punpcklbw %%mm7, %%mm4 \n\t"
02750 "pmullw %%mm5, %%mm1 \n\t"
02751 "pmullw %%mm4, %%mm0 \n\t"
02752 "paddw %3, %%mm1 \n\t"
02753 "paddw %%mm3, %%mm2 \n\t"
02754 "paddw %%mm1, %%mm0 \n\t"
02755 "paddw %%mm2, %%mm0 \n\t"
02756
02757 "psrlw %4, %%mm0 \n\t"
02758 "packuswb %%mm0, %%mm0 \n\t"
02759 "movd %%mm0, %0 \n\t"
02760
02761 : "=m"(dst[x+y*stride])
02762 : "m"(src[0]), "m"(src[1]),
02763 "m"(*r4), "m"(shift2)
02764 );
02765 src += stride;
02766 }
/* Back to the first row, four columns to the right. */
02767 src += 4-h*stride;
02768 }
02769 }
02770
02771 #ifdef CONFIG_ENCODERS
02772
/*
 * Horizontal add of the two dwords in MMX register a, using t as scratch:
 * t = a; a >>= 32; a += t. The sum ends up in the low dword of a.
 */
02773 #define PHADDD(a, t)\
02774 "movq "#a", "#t" \n\t"\
02775 "psrlq $32, "#a" \n\t"\
02776 "paddd "#t", "#a" \n\t"
02777
02778
02779
02780
02781
/*
 * First inclusion of dsputil_mmx_qns.h, used as a template: DEF() appends
 * _mmx to the generated names. PMULHRW here is built from pmulhw on x and y,
 * followed by adding o and an arithmetic shift right by 1. SET_RND maps to
 * MOVQ_WONE and SCALE_OFFSET is 1. All helper macros are undefined again
 * after the include.
 */
02782 #define PMULHRW(x, y, s, o)\
02783 "pmulhw " #s ", "#x " \n\t"\
02784 "pmulhw " #s ", "#y " \n\t"\
02785 "paddw " #o ", "#x " \n\t"\
02786 "paddw " #o ", "#y " \n\t"\
02787 "psraw $1, "#x " \n\t"\
02788 "psraw $1, "#y " \n\t"
02789 #define DEF(x) x ## _mmx
02790 #define SET_RND MOVQ_WONE
02791 #define SCALE_OFFSET 1
02792
02793 #include "dsputil_mmx_qns.h"
02794
02795 #undef DEF
02796 #undef SET_RND
02797 #undef SCALE_OFFSET
02798 #undef PMULHRW
02799
/*
 * Second inclusion of dsputil_mmx_qns.h: DEF() appends _3dnow. PMULHRW is the
 * pmulhrw instruction applied to x and y (the o argument is unused), SET_RND
 * expands to nothing and SCALE_OFFSET is 0.
 */
02800 #define DEF(x) x ## _3dnow
02801 #define SET_RND(x)
02802 #define SCALE_OFFSET 0
02803 #define PMULHRW(x, y, s, o)\
02804 "pmulhrw " #s ", "#x " \n\t"\
02805 "pmulhrw " #s ", "#y " \n\t"
02806
02807 #include "dsputil_mmx_qns.h"
02808
02809 #undef DEF
02810 #undef SET_RND
02811 #undef SCALE_OFFSET
02812 #undef PMULHRW
02813
/*
 * Third inclusion of dsputil_mmx_qns.h, only when HAVE_SSSE3 is defined:
 * DEF() appends _ssse3. PHADDD is redefined to use pshufw $0x0E plus paddd,
 * PMULHRW is pmulhrsw applied to x and y (the o argument is unused), SET_RND
 * expands to nothing and SCALE_OFFSET is -1. PHADDD is left undefined
 * afterwards.
 */
02814 #ifdef HAVE_SSSE3
02815 #undef PHADDD
02816 #define DEF(x) x ## _ssse3
02817 #define SET_RND(x)
02818 #define SCALE_OFFSET -1
02819 #define PHADDD(a, t)\
02820 "pshufw $0x0E, "#a", "#t" \n\t"\
02821 "paddd "#t", "#a" \n\t"
02822 #define PMULHRW(x, y, s, o)\
02823 "pmulhrsw " #s ", "#x " \n\t"\
02824 "pmulhrsw " #s ", "#y " \n\t"
02825
02826 #include "dsputil_mmx_qns.h"
02827
02828 #undef DEF
02829 #undef SET_RND
02830 #undef SCALE_OFFSET
02831 #undef PMULHRW
02832 #undef PHADDD
02833 #endif //HAVE_SSSE3
02834
02835 #endif
02836
/*
 * PREFETCH(name, op) defines name(mem, stride, h), which issues instruction
 * op on the byte at mem and then on each byte stride further on, h times in
 * total. The loop body runs before h is tested, so h is expected to be at
 * least 1 on entry. Instantiated as prefetch_mmx2 (prefetcht0) and
 * prefetch_3dnow (prefetch); the macro is undefined afterwards.
 */
02837 #define PREFETCH(name, op) \
02838 static void name(void *mem, int stride, int h){\
02839 const uint8_t *p= mem;\
02840 do{\
02841 asm volatile(#op" %0" :: "m"(*p));\
02842 p+= stride;\
02843 }while(--h);\
02844 }
02845 PREFETCH(prefetch_mmx2, prefetcht0)
02846 PREFETCH(prefetch_3dnow, prefetch)
02847 #undef PREFETCH
02848
02849 #include "h264dsp_mmx.c"
02850
02851
02852 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
02853
/* CAVS qpel8 mc00 (put): forwards to put_pixels8_mmx with h = 8. */
02854 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
02855 put_pixels8_mmx(dst, src, stride, 8);
02856 }
/* CAVS qpel8 mc00 (avg): forwards to avg_pixels8_mmx with h = 8. */
02857 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
02858 avg_pixels8_mmx(dst, src, stride, 8);
02859 }
/* CAVS qpel16 mc00 (put): forwards to put_pixels16_mmx with h = 16. */
02860 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
02861 put_pixels16_mmx(dst, src, stride, 16);
02862 }
/* CAVS qpel16 mc00 (avg): forwards to avg_pixels16_mmx with h = 16. */
02863 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
02864 avg_pixels16_mmx(dst, src, stride, 16);
02865 }
02866
02867
02868 void ff_mmx_idct(DCTELEM *block);
02869 void ff_mmxext_idct(DCTELEM *block);
02870
02871
02872
02873 #ifdef CONFIG_GPL
/* Runs ff_mmx_idct on block, then passes the same block to put_pixels_clamped_mmx for dest/line_size. */
02874 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
02875 {
02876 ff_mmx_idct (block);
02877 put_pixels_clamped_mmx(block, dest, line_size);
02878 }
/* Runs ff_mmx_idct on block, then passes the same block to add_pixels_clamped_mmx for dest/line_size. */
02879 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
02880 {
02881 ff_mmx_idct (block);
02882 add_pixels_clamped_mmx(block, dest, line_size);
02883 }
/* Runs ff_mmxext_idct on block, then passes the same block to put_pixels_clamped_mmx for dest/line_size. */
02884 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
02885 {
02886 ff_mmxext_idct (block);
02887 put_pixels_clamped_mmx(block, dest, line_size);
02888 }
/* Runs ff_mmxext_idct on block, then passes the same block to add_pixels_clamped_mmx for dest/line_size. */
02889 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
02890 {
02891 ff_mmxext_idct (block);
02892 add_pixels_clamped_mmx(block, dest, line_size);
02893 }
02894 #endif
/* Runs ff_idct_xvid_mmx on block, then passes the same block to put_pixels_clamped_mmx for dest/line_size. */
02895 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
02896 {
02897 ff_idct_xvid_mmx (block);
02898 put_pixels_clamped_mmx(block, dest, line_size);
02899 }
/* Runs ff_idct_xvid_mmx on block, then passes the same block to add_pixels_clamped_mmx for dest/line_size. */
02900 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
02901 {
02902 ff_idct_xvid_mmx (block);
02903 add_pixels_clamped_mmx(block, dest, line_size);
02904 }
/* Runs ff_idct_xvid_mmx2 on block, then passes the same block to put_pixels_clamped_mmx for dest/line_size. */
02905 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
02906 {
02907 ff_idct_xvid_mmx2 (block);
02908 put_pixels_clamped_mmx(block, dest, line_size);
02909 }
/* Runs ff_idct_xvid_mmx2 on block, then passes the same block to add_pixels_clamped_mmx for dest/line_size. */
02910 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
02911 {
02912 ff_idct_xvid_mmx2 (block);
02913 add_pixels_clamped_mmx(block, dest, line_size);
02914 }
02915
/*
 * Rewrites mag[] and ang[] in place, two floats per iteration, for
 * i = 0, 2, ... < blocksize. Per element, with m = mag[i], a = ang[i] and
 * a' = (m >= 0) ? -a : a (sign bit flipped where m >= 0):
 *   if a >= 0:  ang[i] = m + a',  mag[i] = m
 *   else:       ang[i] = m,       mag[i] = m - a'
 * Ends with femms.
 */
02916 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
02917 {
02918 int i;
/* mm7 = 0, the comparison reference; it is relied upon across the loop's asm statements. */
02919 asm volatile("pxor %%mm7, %%mm7":);
02920 for(i=0; i<blocksize; i+=2) {
02921 asm volatile(
02922 "movq %0, %%mm0 \n\t"
02923 "movq %1, %%mm1 \n\t"
02924 "movq %%mm0, %%mm2 \n\t"
02925 "movq %%mm1, %%mm3 \n\t"
/* mm2 = mask(mag >= 0), mm3 = mask(ang >= 0) */
02926 "pfcmpge %%mm7, %%mm2 \n\t"
02927 "pfcmpge %%mm7, %%mm3 \n\t"
/* keep only the sign bit of the mag mask and flip ang's sign with it */
02928 "pslld $31, %%mm2 \n\t"
02929 "pxor %%mm2, %%mm1 \n\t"
/* mm3 = a' where ang >= 0 (else 0); mm4 = a' where ang < 0 (else 0) */
02930 "movq %%mm3, %%mm4 \n\t"
02931 "pand %%mm1, %%mm3 \n\t"
02932 "pandn %%mm1, %%mm4 \n\t"
02933 "pfadd %%mm0, %%mm3 \n\t"
02934 "pfsub %%mm4, %%mm0 \n\t"
02935 "movq %%mm3, %1 \n\t"
02936 "movq %%mm0, %0 \n\t"
02937 :"+m"(mag[i]), "+m"(ang[i])
02938 ::"memory"
02939 );
02940 }
02941 asm volatile("femms");
02942 }
/*
 * SSE variant: rewrites mag[] and ang[] in place, four floats per iteration,
 * for i = 0, 4, ... < blocksize. Per element, with m = mag[i], a = ang[i] and
 * a' = (0 <= m) ? -a : a (sign bit flipped where 0 <= m):
 *   if 0 <= a:  ang[i] = m + a',  mag[i] = m
 *   else:       ang[i] = m,       mag[i] = m - a'
 * All loads and stores use movaps, the aligned move, so mag and ang are
 * expected to be 16-byte aligned.
 */
02943 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
02944 {
02945 int i;
02946
/* xmm5 = sign-bit mask (0x80000000 in every dword); relied upon across the loop's asm statements. */
02947 asm volatile(
02948 "movaps %0, %%xmm5 \n\t"
02949 ::"m"(ff_pdw_80000000[0])
02950 );
02951 for(i=0; i<blocksize; i+=4) {
02952 asm volatile(
02953 "movaps %0, %%xmm0 \n\t"
02954 "movaps %1, %%xmm1 \n\t"
02955 "xorps %%xmm2, %%xmm2 \n\t"
02956 "xorps %%xmm3, %%xmm3 \n\t"
/* xmm2 = mask(0 <= mag), xmm3 = mask(0 <= ang) */
02957 "cmpleps %%xmm0, %%xmm2 \n\t"
02958 "cmpleps %%xmm1, %%xmm3 \n\t"
/* keep only the sign bit of the mag mask and flip ang's sign with it */
02959 "andps %%xmm5, %%xmm2 \n\t"
02960 "xorps %%xmm2, %%xmm1 \n\t"
/* xmm3 = a' where 0 <= ang (else 0); xmm4 = a' where ang < 0 (else 0) */
02961 "movaps %%xmm3, %%xmm4 \n\t"
02962 "andps %%xmm1, %%xmm3 \n\t"
02963 "andnps %%xmm1, %%xmm4 \n\t"
02964 "addps %%xmm0, %%xmm3 \n\t"
02965 "subps %%xmm4, %%xmm0 \n\t"
02966 "movaps %%xmm3, %1 \n\t"
02967 "movaps %%xmm0, %0 \n\t"
02968 :"+m"(mag[i]), "+m"(ang[i])
02969 ::"memory"
02970 );
02971 }
02972 }
02973
02974 #ifdef CONFIG_ENCODERS
02975 static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
02976 {
02977 double c = 2.0 / (len-1.0);
02978 int n2 = len>>1;
02979 long i = -n2*sizeof(int32_t);
02980 long j = n2*sizeof(int32_t);
02981 asm volatile(
02982 "movsd %0, %%xmm7 \n\t"
02983 "movapd %1, %%xmm6 \n\t"
02984 "movapd %2, %%xmm5 \n\t"
02985 "movlhps %%xmm7, %%xmm7 \n\t"
02986 "subpd %%xmm5, %%xmm7 \n\t"
02987 "addsd %%xmm6, %%xmm7 \n\t"
02988 ::"m"(c), "m"(*ff_pd_1), "m"(*ff_pd_2)
02989 );
02990 #define WELCH(MOVPD)\
02991 asm volatile(\
02992 "1: \n\t"\
02993 "movapd %%xmm7, %%xmm1 \n\t"\
02994 "mulpd %%xmm1, %%xmm1 \n\t"\
02995 "movapd %%xmm6, %%xmm0 \n\t"\
02996 "subpd %%xmm1, %%xmm0 \n\t"\
02997 "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
02998 "cvtpi2pd (%4,%0), %%xmm2 \n\t"\
02999 "cvtpi2pd (%5,%1), %%xmm3 \n\t"\
03000 "mulpd %%xmm0, %%xmm2 \n\t"\
03001 "mulpd %%xmm1, %%xmm3 \n\t"\
03002 "movapd %%xmm2, (%2,%0,2) \n\t"\
03003 MOVPD" %%xmm3, (%3,%1,2) \n\t"\
03004 "subpd %%xmm5, %%xmm7 \n\t"\
03005 "sub $8, %1 \n\t"\
03006 "add $8, %0 \n\t"\
03007 "jl 1b \n\t"\
03008 :"+&r"(i), "+&r"(j)\
03009 :"r"(w_data+n2), "r"(w_data+len-2-n2),\
03010 "r"(data+n2), "r"(data+len-2-n2)\
03011 );
03012 if(len&1)
03013 WELCH("movupd")
03014 else
03015 WELCH("movapd")
03016 #undef WELCH
03017 }
03018
03019 static void flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
03020 double *autoc)
03021 {
03022 double tmp[len + lag + 2];
03023 double *data1 = tmp + lag;
03024 int j;
03025
03026 if((long)data1 & 15)
03027 data1++;
03028
03029 apply_welch_window_sse2(data, len, data1);
03030
03031 for(j=0; j<lag; j++)
03032 data1[j-lag]= 0.0;
03033 data1[len] = 0.0;
03034
03035 for(j=0; j<lag; j+=2){
03036 long i = -len*sizeof(double);
03037 if(j == lag-2) {
03038 asm volatile(
03039 "movsd %6, %%xmm0 \n\t"
03040 "movsd %6, %%xmm1 \n\t"
03041 "movsd %6, %%xmm2 \n\t"
03042 "1: \n\t"
03043 "movapd (%4,%0), %%xmm3 \n\t"
03044 "movupd -8(%5,%0), %%xmm4 \n\t"
03045 "movapd (%5,%0), %%xmm5 \n\t"
03046 "mulpd %%xmm3, %%xmm4 \n\t"
03047 "mulpd %%xmm3, %%xmm5 \n\t"
03048 "mulpd -16(%5,%0), %%xmm3 \n\t"
03049 "addpd %%xmm4, %%xmm1 \n\t"
03050 "addpd %%xmm5, %%xmm0 \n\t"
03051 "addpd %%xmm3, %%xmm2 \n\t"
03052 "add $16, %0 \n\t"
03053 "jl 1b \n\t"
03054 "movhlps %%xmm0, %%xmm3 \n\t"
03055 "movhlps %%xmm1, %%xmm4 \n\t"
03056 "movhlps %%xmm2, %%xmm5 \n\t"
03057 "addsd %%xmm3, %%xmm0 \n\t"
03058 "addsd %%xmm4, %%xmm1 \n\t"
03059 "addsd %%xmm5, %%xmm2 \n\t"
03060 "movsd %%xmm0, %1 \n\t"
03061 "movsd %%xmm1, %2 \n\t"
03062 "movsd %%xmm2, %3 \n\t"
03063 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
03064 :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
03065 );
03066 } else {
03067 asm volatile(
03068 "movsd %5, %%xmm0 \n\t"
03069 "movsd %5, %%xmm1 \n\t"
03070 "1: \n\t"
03071 "movapd (%3,%0), %%xmm3 \n\t"
03072 "movupd -8(%4,%0), %%xmm4 \n\t"
03073 "mulpd %%xmm3, %%xmm4 \n\t"
03074 "mulpd (%4,%0), %%xmm3 \n\t"
03075 "addpd %%xmm4, %%xmm1 \n\t"
03076 "addpd %%xmm3, %%xmm0 \n\t"
03077 "add $16, %0 \n\t"
03078 "jl 1b \n\t"
03079 "movhlps %%xmm0, %%xmm3 \n\t"
03080 "movhlps %%xmm1, %%xmm4 \n\t"
03081 "addsd %%xmm3, %%xmm0 \n\t"
03082 "addsd %%xmm4, %%xmm1 \n\t"
03083 "movsd %%xmm0, %1 \n\t"
03084 "movsd %%xmm1, %2 \n\t"
03085 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
03086 :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
03087 );
03088 }
03089 }
03090 }
03091 #endif // CONFIG_ENCODERS
03092
/**
 * In-place element-wise product dst[k] *= src[k], 3DNow! version.
 *
 * i is a byte offset that starts at the last group of four floats
 * ((len-4)*4) and is decremented by 16 per iteration until it becomes
 * negative.  The loop body runs before the counter is tested, so at least
 * one group of four floats is always processed.  femms is executed inside
 * the same asm statement once the loop is done.
 *
 * NOTE(review): len is presumably a positive multiple of 4 -- confirm
 * against callers.
 *
 * @param dst first factor and destination
 * @param src second factor
 * @param len number of floats
 */
03093 static void vector_fmul_3dnow(float *dst, const float *src, int len){
03094 long i = (len-4)*4;
03095 asm volatile(
03096 "1: \n\t"
03097 "movq (%1,%0), %%mm0 \n\t"
03098 "movq 8(%1,%0), %%mm1 \n\t"
03099 "pfmul (%2,%0), %%mm0 \n\t"
03100 "pfmul 8(%2,%0), %%mm1 \n\t"
03101 "movq %%mm0, (%1,%0) \n\t"
03102 "movq %%mm1, 8(%1,%0) \n\t"
03103 "sub $16, %0 \n\t"
03104 "jge 1b \n\t"
03105 "femms \n\t"
03106 :"+r"(i)
03107 :"r"(dst), "r"(src)
03108 :"memory"
03109 );
03110 }
/**
 * In-place element-wise product dst[k] *= src[k], SSE version.
 *
 * i is a byte offset that starts at the last group of eight floats
 * ((len-8)*4) and is decremented by 32 per iteration until it becomes
 * negative.  The loop body runs before the counter is tested, so at least
 * one group of eight floats is always processed.  dst is accessed with
 * movaps and src is a mulps memory operand, so both must be 16-byte aligned.
 *
 * NOTE(review): len is presumably a positive multiple of 8 -- confirm
 * against callers.
 *
 * @param dst first factor and destination
 * @param src second factor
 * @param len number of floats
 */
03111 static void vector_fmul_sse(float *dst, const float *src, int len){
03112 long i = (len-8)*4;
03113 asm volatile(
03114 "1: \n\t"
03115 "movaps (%1,%0), %%xmm0 \n\t"
03116 "movaps 16(%1,%0), %%xmm1 \n\t"
03117 "mulps (%2,%0), %%xmm0 \n\t"
03118 "mulps 16(%2,%0), %%xmm1 \n\t"
03119 "movaps %%xmm0, (%1,%0) \n\t"
03120 "movaps %%xmm1, 16(%1,%0) \n\t"
03121 "sub $32, %0 \n\t"
03122 "jge 1b \n\t"
03123 :"+r"(i)
03124 :"r"(dst), "r"(src)
03125 :"memory"
03126 );
03127 }
03128
/**
 * dst[k] = src0[k] * src1[len-1-k], extended-3DNow! version.
 *
 * src1 is read forwards four floats at a time and reversed in registers
 * (pswapd plus swapping the two register halves), while src0 and dst are
 * walked backwards through the byte offset i, which starts at len*4-16 and
 * drops by 16 per iteration until it becomes negative.  The loop body runs
 * before the counter is tested, so at least four floats are always
 * processed.  femms is issued after the loop.
 *
 * dst is written only through a register operand; the "memory" clobber
 * tells the compiler that memory is modified (matching vector_fmul_3dnow
 * and vector_fmul_sse).
 *
 * NOTE(review): len is presumably a positive multiple of 4 -- confirm
 * against callers.
 *
 * @param dst  destination
 * @param src0 factor read in the same order as dst
 * @param src1 factor read in reversed order
 * @param len  number of floats
 */
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;
    __asm__ volatile(
        "1: \n\t"
        "pswapd 8(%1), %%mm0 \n\t"
        "pswapd (%1), %%mm1 \n\t"
        "pfmul (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq %%mm0, (%2,%0) \n\t"
        "movq %%mm1, 8(%2,%0) \n\t"
        "add $16, %1 \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory"
    );
    __asm__ volatile("femms");
}
/**
 * dst[k] = src0[k] * src1[len-1-k], SSE version.
 *
 * src1 is read forwards eight floats at a time and reversed in registers
 * (shufps $0x1b on each half, halves swapped), while src0 and dst are walked
 * backwards through the byte offset i, which starts at len*4-32 and drops by
 * 32 per iteration until it becomes negative.  The loop body runs before the
 * counter is tested, so at least eight floats are always processed.  All
 * three buffers are accessed with aligned instructions and must be 16-byte
 * aligned.
 *
 * dst is written only through a register operand; the "memory" clobber
 * tells the compiler that memory is modified (matching vector_fmul_sse).
 *
 * NOTE(review): len is presumably a positive multiple of 8 -- confirm
 * against callers.
 *
 * @param dst  destination
 * @param src0 factor read in the same order as dst
 * @param src1 factor read in reversed order
 * @param len  number of floats
 */
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;
    __asm__ volatile(
        "1: \n\t"
        "movaps 16(%1), %%xmm0 \n\t"
        "movaps (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps (%3,%0), %%xmm0 \n\t"
        "mulps 16(%3,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%2,%0) \n\t"
        "movaps %%xmm1, 16(%2,%0) \n\t"
        "add $32, %1 \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory"
    );
}
03166
/**
 * Multiply-add src0[k]*src1[k] + src2[k] into dst, 3DNow! version.
 *
 * Two cases are handled in assembly, both only when src3 == 0:
 *   - step == 2: results are stored to every second float of dst (byte
 *     offsets 0, 8, 16, 24 per group of four results); dst is first advanced
 *     to the last output group and then moved back 32 bytes per iteration;
 *   - step == 1: results are stored contiguously at the same offset as the
 *     inputs.
 * Any other step / src3 combination is forwarded unchanged to
 * ff_vector_fmul_add_add_c().
 *
 * The inputs are walked backwards with byte offset i, starting at
 * (len-4)*4 and dropping by 16 until it becomes negative; the body runs
 * before the test, so at least four elements are processed in the asm paths.
 * femms is executed at the end on every path, including the C fallback.
 *
 * NOTE(review): len is presumably a positive multiple of 4 -- confirm
 * against callers.
 *
 * @param dst  destination (strided by step)
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param src3 extra term; only 0 is handled by the assembly paths
 * @param len  number of elements
 * @param step output stride in floats; 1 and 2 have assembly paths
 */
03167 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
03168 const float *src2, int src3, int len, int step){
03169 long i = (len-4)*4;
03170 if(step == 2 && src3 == 0){
/* Point dst at the output slot of the last group of four results. */
03171 dst += (len-4)*2;
03172 asm volatile(
03173 "1: \n\t"
03174 "movq (%2,%0), %%mm0 \n\t"
03175 "movq 8(%2,%0), %%mm1 \n\t"
03176 "pfmul (%3,%0), %%mm0 \n\t"
03177 "pfmul 8(%3,%0), %%mm1 \n\t"
03178 "pfadd (%4,%0), %%mm0 \n\t"
03179 "pfadd 8(%4,%0), %%mm1 \n\t"
03180 "movd %%mm0, (%1) \n\t"
03181 "movd %%mm1, 16(%1) \n\t"
03182 "psrlq $32, %%mm0 \n\t"
03183 "psrlq $32, %%mm1 \n\t"
03184 "movd %%mm0, 8(%1) \n\t"
03185 "movd %%mm1, 24(%1) \n\t"
03186 "sub $32, %1 \n\t"
03187 "sub $16, %0 \n\t"
03188 "jge 1b \n\t"
03189 :"+r"(i), "+r"(dst)
03190 :"r"(src0), "r"(src1), "r"(src2)
03191 :"memory"
03192 );
03193 }
03194 else if(step == 1 && src3 == 0){
03195 asm volatile(
03196 "1: \n\t"
03197 "movq (%2,%0), %%mm0 \n\t"
03198 "movq 8(%2,%0), %%mm1 \n\t"
03199 "pfmul (%3,%0), %%mm0 \n\t"
03200 "pfmul 8(%3,%0), %%mm1 \n\t"
03201 "pfadd (%4,%0), %%mm0 \n\t"
03202 "pfadd 8(%4,%0), %%mm1 \n\t"
03203 "movq %%mm0, (%1,%0) \n\t"
03204 "movq %%mm1, 8(%1,%0) \n\t"
03205 "sub $16, %0 \n\t"
03206 "jge 1b \n\t"
03207 :"+r"(i)
03208 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
03209 :"memory"
03210 );
03211 }
03212 else
03213 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
03214 asm volatile("femms");
03215 }
/**
 * Multiply-add src0[k]*src1[k] + src2[k] into dst, SSE version.
 *
 * Two cases are handled in assembly, both only when src3 == 0:
 *   - step == 2: the eight results of an iteration are scattered with movss
 *     to every second float of dst (byte offsets 0, 8, ..., 56); dst is
 *     first advanced to the last output group and then moved back 64 bytes
 *     per iteration;
 *   - step == 1: results are stored contiguously with movaps at the same
 *     offset as the inputs (dst must then be 16-byte aligned).
 * Any other step / src3 combination is forwarded unchanged to
 * ff_vector_fmul_add_add_c().
 *
 * The inputs are walked backwards with byte offset i, starting at
 * (len-8)*4 and dropping by 32 until it becomes negative; the body runs
 * before the test, so at least eight elements are processed in the asm
 * paths.  src0 is loaded with movaps and src1 / src2 are mulps / addps
 * memory operands, so all three must be 16-byte aligned.
 *
 * NOTE(review): len is presumably a positive multiple of 8 -- confirm
 * against callers.
 *
 * @param dst  destination (strided by step)
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param src3 extra term; only 0 is handled by the assembly paths
 * @param len  number of elements
 * @param step output stride in floats; 1 and 2 have assembly paths
 */
03216 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
03217 const float *src2, int src3, int len, int step){
03218 long i = (len-8)*4;
03219 if(step == 2 && src3 == 0){
/* Point dst at the output slot of the last group of eight results. */
03220 dst += (len-8)*2;
03221 asm volatile(
03222 "1: \n\t"
03223 "movaps (%2,%0), %%xmm0 \n\t"
03224 "movaps 16(%2,%0), %%xmm1 \n\t"
03225 "mulps (%3,%0), %%xmm0 \n\t"
03226 "mulps 16(%3,%0), %%xmm1 \n\t"
03227 "addps (%4,%0), %%xmm0 \n\t"
03228 "addps 16(%4,%0), %%xmm1 \n\t"
03229 "movss %%xmm0, (%1) \n\t"
03230 "movss %%xmm1, 32(%1) \n\t"
03231 "movhlps %%xmm0, %%xmm2 \n\t"
03232 "movhlps %%xmm1, %%xmm3 \n\t"
03233 "movss %%xmm2, 16(%1) \n\t"
03234 "movss %%xmm3, 48(%1) \n\t"
03235 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
03236 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
03237 "movss %%xmm0, 8(%1) \n\t"
03238 "movss %%xmm1, 40(%1) \n\t"
03239 "movhlps %%xmm0, %%xmm2 \n\t"
03240 "movhlps %%xmm1, %%xmm3 \n\t"
03241 "movss %%xmm2, 24(%1) \n\t"
03242 "movss %%xmm3, 56(%1) \n\t"
03243 "sub $64, %1 \n\t"
03244 "sub $32, %0 \n\t"
03245 "jge 1b \n\t"
03246 :"+r"(i), "+r"(dst)
03247 :"r"(src0), "r"(src1), "r"(src2)
03248 :"memory"
03249 );
03250 }
03251 else if(step == 1 && src3 == 0){
03252 asm volatile(
03253 "1: \n\t"
03254 "movaps (%2,%0), %%xmm0 \n\t"
03255 "movaps 16(%2,%0), %%xmm1 \n\t"
03256 "mulps (%3,%0), %%xmm0 \n\t"
03257 "mulps 16(%3,%0), %%xmm1 \n\t"
03258 "addps (%4,%0), %%xmm0 \n\t"
03259 "addps 16(%4,%0), %%xmm1 \n\t"
03260 "movaps %%xmm0, (%1,%0) \n\t"
03261 "movaps %%xmm1, 16(%1,%0) \n\t"
03262 "sub $32, %0 \n\t"
03263 "jge 1b \n\t"
03264 :"+r"(i)
03265 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
03266 :"memory"
03267 );
03268 }
03269 else
03270 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
03271 }
03272
/**
 * Convert floats to saturated 16-bit integers, 3DNow! version.
 *
 * Each iteration converts four floats with two pf2id instructions, packs
 * the four 32-bit results to int16 with signed saturation (packssdw) and
 * stores 8 bytes at dst[i].  femms is issued after the loop.
 *
 * The operand constraints name only src[i], src[i+2] and dst[i], while the
 * instructions read 8 bytes from each source operand and write 8 bytes to
 * the destination.  The "memory" clobber makes the full extent of these
 * accesses visible to the compiler.
 *
 * NOTE(review): len is presumably a multiple of 4 -- confirm against
 * callers.
 *
 * @param dst output samples
 * @param src input samples
 * @param len number of samples
 */
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "pf2id %1, %%mm0 \n\t"
            "pf2id %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq %%mm0, %0 \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            :"memory"
        );
    }
    __asm__ volatile("femms");
}
/**
 * Convert floats to saturated 16-bit integers, SSE version.
 *
 * Each iteration converts four floats with two cvtps2pi instructions, packs
 * the four 32-bit results to int16 with signed saturation (packssdw) and
 * stores 8 bytes at dst[i].  MMX registers are used for the intermediate
 * values, so emms is issued after the loop.
 *
 * The operand constraints name only src[i], src[i+2] and dst[i], while the
 * instructions read 8 bytes from each source operand and write 8 bytes to
 * the destination.  The "memory" clobber makes the full extent of these
 * accesses visible to the compiler.
 *
 * NOTE(review): len is presumably a multiple of 4 -- confirm against
 * callers.
 *
 * @param dst output samples
 * @param src input samples
 * @param len number of samples
 */
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        __asm__ volatile(
            "cvtps2pi %1, %%mm0 \n\t"
            "cvtps2pi %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t"
            "movq %%mm0, %0 \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
            :"memory"
        );
    }
    __asm__ volatile("emms");
}
03302
/*
 * Prototypes of the Snow wavelet helpers that dsputil_init_mmx() installs
 * into the DSPContext.  They are only declared when the Snow decoder is
 * configured; the definitions live outside this file.
 */
03303 #ifdef CONFIG_SNOW_DECODER
03304 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
03305 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
03306 extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
03307 extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
03308 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
03309 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
03310 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
03311 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
03312 #endif
03313
/**
 * Install the x86 SIMD implementations into a DSPContext.
 *
 * The CPU capability flags are obtained from mm_support(), stored in the
 * global mm_flags and adjusted with avctx->dsp_mask.  If MMX is available
 * the function pointers in c are overwritten with MMX versions, then with
 * MMX2 or (only when MMX2 is absent) 3DNow! versions, and finally with
 * SSE2 / SSSE3 / 3DNow! / SSE versions of selected routines.  Later
 * assignments intentionally override earlier ones, so the statement order
 * in this function is significant.
 *
 * @param c     DSP context whose function pointers are filled in
 * @param avctx codec context; dsp_mask, idct_algo, dct_algo, lowres, flags
 *              and codec->id are read
 */
03314 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
03315 {
03316 mm_flags = mm_support();
03317
/* dsp_mask: with FF_MM_FORCE set, its low 16 bits are added to the detected
 * flags; otherwise those bits are removed from the detected flags. */
03318 if (avctx->dsp_mask) {
03319 if (avctx->dsp_mask & FF_MM_FORCE)
03320 mm_flags |= (avctx->dsp_mask & 0xffff);
03321 else
03322 mm_flags &= ~(avctx->dsp_mask & 0xffff);
03323 }
03324
/* Compiled-out debug dump of the detected CPU flags. */
03325 #if 0
03326 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
03327 if (mm_flags & MM_MMX)
03328 av_log(avctx, AV_LOG_INFO, " mmx");
03329 if (mm_flags & MM_MMXEXT)
03330 av_log(avctx, AV_LOG_INFO, " mmxext");
03331 if (mm_flags & MM_3DNOW)
03332 av_log(avctx, AV_LOG_INFO, " 3dnow");
03333 if (mm_flags & MM_SSE)
03334 av_log(avctx, AV_LOG_INFO, " sse");
03335 if (mm_flags & MM_SSE2)
03336 av_log(avctx, AV_LOG_INFO, " sse2");
03337 av_log(avctx, AV_LOG_INFO, "\n");
03338 #endif
03339
/* Everything up to the matching closing brace requires MMX. */
03340 if (mm_flags & MM_MMX) {
03341 const int idct_algo= avctx->idct_algo;
03342
/* Forward DCT (encoders only): SSE2 is preferred, then MMX2, then MMX. */
03343 #ifdef CONFIG_ENCODERS
03344 const int dct_algo = avctx->dct_algo;
03345 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
03346 if(mm_flags & MM_SSE2){
03347 c->fdct = ff_fdct_sse2;
03348 }else if(mm_flags & MM_MMXEXT){
03349 c->fdct = ff_fdct_mmx2;
03350 }else{
03351 c->fdct = ff_fdct_mmx;
03352 }
03353 }
03354 #endif //CONFIG_ENCODERS
/* Inverse DCT selection; skipped entirely when lowres != 0.  Each choice
 * sets the put/add/plain entry points (and, except for the Xvid IDCT, the
 * matching coefficient permutation type). */
03355 if(avctx->lowres==0){
03356 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
03357 c->idct_put= ff_simple_idct_put_mmx;
03358 c->idct_add= ff_simple_idct_add_mmx;
03359 c->idct = ff_simple_idct_mmx;
03360 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
03361 #ifdef CONFIG_GPL
03362 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
03363 if(mm_flags & MM_MMXEXT){
03364 c->idct_put= ff_libmpeg2mmx2_idct_put;
03365 c->idct_add= ff_libmpeg2mmx2_idct_add;
03366 c->idct = ff_mmxext_idct;
03367 }else{
03368 c->idct_put= ff_libmpeg2mmx_idct_put;
03369 c->idct_add= ff_libmpeg2mmx_idct_add;
03370 c->idct = ff_mmx_idct;
03371 }
03372 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
03373 #endif
/* VP3 IDCT: not used for Theora and not when bit-exact output is requested. */
03374 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
03375 idct_algo==FF_IDCT_VP3 &&
03376 avctx->codec->id!=CODEC_ID_THEORA &&
03377 !(avctx->flags & CODEC_FLAG_BITEXACT)){
03378 if(mm_flags & MM_SSE2){
03379 c->idct_put= ff_vp3_idct_put_sse2;
03380 c->idct_add= ff_vp3_idct_add_sse2;
03381 c->idct = ff_vp3_idct_sse2;
03382 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
03383 }else{
03384 ff_vp3_dsp_init_mmx();
03385 c->idct_put= ff_vp3_idct_put_mmx;
03386 c->idct_add= ff_vp3_idct_add_mmx;
03387 c->idct = ff_vp3_idct_mmx;
03388 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
03389 }
/* CAVS: only the permutation type is set here, no IDCT pointers. */
03390 }else if(idct_algo==FF_IDCT_CAVS){
03391 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
03392 }else if(idct_algo==FF_IDCT_XVIDMMX){
03393 if(mm_flags & MM_MMXEXT){
03394 c->idct_put= ff_idct_xvid_mmx2_put;
03395 c->idct_add= ff_idct_xvid_mmx2_add;
03396 c->idct = ff_idct_xvid_mmx2;
03397 }else{
03398 c->idct_put= ff_idct_xvid_mmx_put;
03399 c->idct_add= ff_idct_xvid_mmx_add;
03400 c->idct = ff_idct_xvid_mmx;
03401 }
03402 }
03403 }
03404
/* Baseline MMX block helpers. */
03405 #ifdef CONFIG_ENCODERS
03406 c->get_pixels = get_pixels_mmx;
03407 c->diff_pixels = diff_pixels_mmx;
03408 #endif //CONFIG_ENCODERS
03409 c->put_pixels_clamped = put_pixels_clamped_mmx;
03410 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
03411 c->add_pixels_clamped = add_pixels_clamped_mmx;
03412 c->clear_blocks = clear_blocks_mmx;
03413 #ifdef CONFIG_ENCODERS
03414 c->pix_sum = pix_sum16_mmx;
03415 #endif //CONFIG_ENCODERS
03416
/* Baseline MMX put/avg tables: row [0] receives the *16* functions, row [1]
 * the *8* functions; columns 0..3 receive the plain, _x2, _y2 and _xy2
 * variants.  Entry [x][0] of the put_no_rnd tables reuses put_pixels*_mmx. */
03417 c->put_pixels_tab[0][0] = put_pixels16_mmx;
03418 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
03419 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
03420 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
03421
03422 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
03423 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
03424 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
03425 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
03426
03427 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
03428 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
03429 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
03430 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
03431
03432 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
03433 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
03434 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
03435 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
03436
03437 c->put_pixels_tab[1][0] = put_pixels8_mmx;
03438 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
03439 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
03440 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
03441
03442 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
03443 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
03444 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
03445 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
03446
03447 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
03448 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
03449 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
03450 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
03451
03452 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
03453 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
03454 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
03455 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
03456
03457 c->gmc= gmc_mmx;
03458
03459 c->add_bytes= add_bytes_mmx;
/* Encoder-only MMX metrics; vsad[0] and try_8x8basis are left untouched
 * when bit-exact output is requested. */
03460 #ifdef CONFIG_ENCODERS
03461 c->diff_bytes= diff_bytes_mmx;
03462 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
03463
03464 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
03465 c->hadamard8_diff[1]= hadamard8_diff_mmx;
03466
03467 c->pix_norm1 = pix_norm1_mmx;
03468 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
03469 c->sse[1] = sse8_mmx;
03470 c->vsad[4]= vsad_intra16_mmx;
03471
03472 c->nsse[0] = nsse16_mmx;
03473 c->nsse[1] = nsse8_mmx;
03474 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03475 c->vsad[0] = vsad16_mmx;
03476 }
03477
03478 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03479 c->try_8x8basis= try_8x8basis_mmx;
03480 }
03481 c->add_8x8basis= add_8x8basis_mmx;
03482
03483 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
03484
03485 #endif //CONFIG_ENCODERS
03486
03487 if (ENABLE_ANY_H263) {
03488 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
03489 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
03490 }
03491 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
03492 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
03493
/* Chained assignments: each *_dc_add pointer receives the same function as
 * the corresponding full add; the MMX2 branch below replaces the dc ones. */
03494 c->h264_idct_dc_add=
03495 c->h264_idct_add= ff_h264_idct_add_mmx;
03496 c->h264_idct8_dc_add=
03497 c->h264_idct8_add= ff_h264_idct8_add_mmx;
03498
/* MMX2 overrides.  The 3DNow! branch further down is an else-if, so it is
 * only taken when MMX2 is not available. */
03499 if (mm_flags & MM_MMXEXT) {
03500 c->prefetch = prefetch_mmx2;
03501
03502 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
03503 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
03504
03505 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
03506 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
03507 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
03508
03509 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
03510 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
03511
03512 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
03513 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
03514 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
03515
03516 #ifdef CONFIG_ENCODERS
03517 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
03518 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
03519 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
03520 c->vsad[4]= vsad_intra16_mmx2;
03521 #endif //CONFIG_ENCODERS
03522
03523 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
03524 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
03525
/* These MMX2 variants are skipped when bit-exact output is requested. */
03526 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03527 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
03528 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
03529 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
03530 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
03531 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
03532 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
03533 #ifdef CONFIG_ENCODERS
03534 c->vsad[0] = vsad16_mmx2;
03535 #endif //CONFIG_ENCODERS
03536 }
03537
/* Quarter-pel tables via SET_QPEL_FUNC (macro defined outside this chunk);
 * the "#if 1" keeps them enabled. */
03538 #if 1
03539 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
03540 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
03541 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
03542 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
03543 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
03544 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
03545 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
03546 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
03547 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
03548 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
03549 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
03550 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
03551 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
03552 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
03553 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
03554 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
03555 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
03556 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
03557 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
03558 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
03559 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
03560 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
03561 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
03562 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
03563 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
03564 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
03565 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
03566 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
03567 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
03568 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
03569 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
03570 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
03571 #endif
03572
03573
/* dspfunc(PFX, IDX, NUM) fills the 16 entries of c->PFX_pixels_tab[IDX] with
 * PFX<NUM>_mcXY_mmx2; it is #undef'd after the MMX2 uses below. */
03574 #define dspfunc(PFX, IDX, NUM) \
03575 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
03576 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
03577 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
03578 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
03579 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
03580 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
03581 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
03582 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
03583 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
03584 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
03585 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
03586 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
03587 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
03588 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
03589 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
03590 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
03591
03592 dspfunc(put_h264_qpel, 0, 16);
03593 dspfunc(put_h264_qpel, 1, 8);
03594 dspfunc(put_h264_qpel, 2, 4);
03595 dspfunc(avg_h264_qpel, 0, 16);
03596 dspfunc(avg_h264_qpel, 1, 8);
03597 dspfunc(avg_h264_qpel, 2, 4);
03598
03599 dspfunc(put_2tap_qpel, 0, 16);
03600 dspfunc(put_2tap_qpel, 1, 8);
03601 dspfunc(avg_2tap_qpel, 0, 16);
03602 dspfunc(avg_2tap_qpel, 1, 8);
03603 #undef dspfunc
03604
/* H.264 chroma MC, loop filters and weighted prediction, MMX2 versions. */
03605 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
03606 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
03607 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
03608 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
03609 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
03610 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
03611 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
03612 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
03613 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
03614 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
03615 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
03616
03617 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
03618 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
03619 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
03620 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
03621 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
03622 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
03623 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
03624 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
03625
03626 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
03627 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
03628 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
03629 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
03630 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
03631 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
03632 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
03633 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
03634
03635 #ifdef CONFIG_CAVS_DECODER
03636 ff_cavsdsp_init_mmx2(c, avctx);
03637 #endif
03638
03639 #ifdef CONFIG_ENCODERS
03640 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
03641 #endif //CONFIG_ENCODERS
/* 3DNow! overrides, used only when MMX2 is not available. */
03642 } else if (mm_flags & MM_3DNOW) {
03643 c->prefetch = prefetch_3dnow;
03644
03645 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
03646 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
03647
03648 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
03649 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
03650 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
03651
03652 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
03653 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
03654
03655 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
03656 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
03657 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
03658
/* These 3DNow! variants are skipped when bit-exact output is requested. */
03659 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03660 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
03661 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
03662 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
03663 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
03664 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
03665 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
03666 }
03667
03668 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
03669 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
03670 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
03671 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
03672 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
03673 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
03674 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
03675 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
03676 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
03677 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
03678 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
03679 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
03680 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
03681 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
03682 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
03683 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
03684 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
03685 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
03686 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
03687 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
03688 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
03689 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
03690 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
03691 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
03692 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
03693 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
03694 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
03695 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
03696 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
03697 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
03698 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
03699 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
03700
/* Same helper as in the MMX2 branch but with the _3dnow suffix.  Unlike
 * there, this definition is not #undef'd within this function. */
03701 #define dspfunc(PFX, IDX, NUM) \
03702 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
03703 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
03704 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
03705 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
03706 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
03707 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
03708 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
03709 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
03710 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
03711 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
03712 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
03713 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
03714 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
03715 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
03716 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
03717 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
03718
03719 dspfunc(put_h264_qpel, 0, 16);
03720 dspfunc(put_h264_qpel, 1, 8);
03721 dspfunc(put_h264_qpel, 2, 4);
03722 dspfunc(avg_h264_qpel, 0, 16);
03723 dspfunc(avg_h264_qpel, 1, 8);
03724 dspfunc(avg_h264_qpel, 2, 4);
03725
03726 dspfunc(put_2tap_qpel, 0, 16);
03727 dspfunc(put_2tap_qpel, 1, 8);
03728 dspfunc(avg_2tap_qpel, 0, 16);
03729 dspfunc(avg_2tap_qpel, 1, 8);
03730
03731 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
03732 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
03733 }
03734
/* Encoder-only SSE2 / SSSE3 overrides; these run after the MMX2 / 3DNow!
 * branches and therefore replace their choices. */
03735 #ifdef CONFIG_ENCODERS
03736 if(mm_flags & MM_SSE2){
03737 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
03738 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
03739 c->hadamard8_diff[1]= hadamard8_diff_sse2;
03740 #ifdef HAVE_EBX_AVAILABLE
03741 c->flac_compute_autocorr = flac_compute_autocorr_sse2;
03742 #endif
03743 }
03744
03745 #ifdef HAVE_SSSE3
03746 if(mm_flags & MM_SSSE3){
03747 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03748 c->try_8x8basis= try_8x8basis_ssse3;
03749 }
03750 c->add_8x8basis= add_8x8basis_ssse3;
03751 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
03752 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
03753 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
03754 }
03755 #endif
03756 #endif
03757
/* Snow: the "& 0" makes the SSE2 condition always false, so the SSE2
 * routines are never selected and the else branch is always taken. */
03758 #ifdef CONFIG_SNOW_DECODER
03759 if(mm_flags & MM_SSE2 & 0){
03760 c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
03761 #ifdef HAVE_7REGS
03762 c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
03763 #endif
03764 c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
03765 }
03766 else{
03767 if(mm_flags & MM_MMXEXT){
03768 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
03769 #ifdef HAVE_7REGS
03770 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
03771 #endif
03772 }
03773 c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
03774 }
03775 #endif
03776
/* Float / audio helpers.  Order matters: 3DNow! is set first, SSE then
 * overrides it where both exist, and finally the 3DNow! version of
 * vector_fmul_add_add is set last so it wins over the SSE one. */
03777 if(mm_flags & MM_3DNOW){
03778 #ifdef CONFIG_ENCODERS
03779 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03780 c->try_8x8basis= try_8x8basis_3dnow;
03781 }
03782 c->add_8x8basis= add_8x8basis_3dnow;
03783 #endif //CONFIG_ENCODERS
03784 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
03785 c->vector_fmul = vector_fmul_3dnow;
03786 if(!(avctx->flags & CODEC_FLAG_BITEXACT))
03787 c->float_to_int16 = float_to_int16_3dnow;
03788 }
03789 if(mm_flags & MM_3DNOWEXT)
03790 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
03791 if(mm_flags & MM_SSE){
03792 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
03793 c->vector_fmul = vector_fmul_sse;
03794 c->float_to_int16 = float_to_int16_sse;
03795 c->vector_fmul_reverse = vector_fmul_reverse_sse;
03796 c->vector_fmul_add_add = vector_fmul_add_add_sse;
03797 }
03798 if(mm_flags & MM_3DNOW)
03799 c->vector_fmul_add_add = vector_fmul_add_add_3dnow;
03800 }
03801
/* Called regardless of the MMX check above (encoder builds only). */
03802 #ifdef CONFIG_ENCODERS
03803 dsputil_init_pix_mmx(c, avctx);
03804 #endif //CONFIG_ENCODERS
/* Compiled-out block that would replace the routines with just_return. */
03805 #if 0
03806
03807 get_pixels = just_return;
03808 put_pixels_clamped = just_return;
03809 add_pixels_clamped = just_return;
03810
03811 pix_abs16x16 = just_return;
03812 pix_abs16x16_x2 = just_return;
03813 pix_abs16x16_y2 = just_return;
03814 pix_abs16x16_xy2 = just_return;
03815
03816 put_pixels_tab[0] = just_return;
03817 put_pixels_tab[1] = just_return;
03818 put_pixels_tab[2] = just_return;
03819 put_pixels_tab[3] = just_return;
03820
03821 put_no_rnd_pixels_tab[0] = just_return;
03822 put_no_rnd_pixels_tab[1] = just_return;
03823 put_no_rnd_pixels_tab[2] = just_return;
03824 put_no_rnd_pixels_tab[3] = just_return;
03825
03826 avg_pixels_tab[0] = just_return;
03827 avg_pixels_tab[1] = just_return;
03828 avg_pixels_tab[2] = just_return;
03829 avg_pixels_tab[3] = just_return;
03830
03831 avg_no_rnd_pixels_tab[0] = just_return;
03832 avg_no_rnd_pixels_tab[1] = just_return;
03833 avg_no_rnd_pixels_tab[2] = just_return;
03834 avg_no_rnd_pixels_tab[3] = just_return;
03835
03836
03837
03838 #endif
03839 }