00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00045
00046 #include "STTypes.h"
00047 using namespace std;
00048 using namespace soundtouch;
00049
00050 #ifdef ALLOW_MMX
00051 #include <stdexcept>
00052 #include <string>
00053 #include <climits>
00054
00055
00056
00057
// Every MMX step in this file is written once as SI(intrinsic_form, asm_form).
// USE_GCC_INTRINSICS decides which of the two forms survives preprocessing.
00058 #ifdef USE_GCC_INTRINSICS
00059 # include <mmintrin.h>
// Intrinsic build: SI keeps its first argument and drops the rest.
00060 # define SI(A,B...) A
// Intrinsic build: GI passes its arguments through; the functions below use it
// to declare the __m64 locals (m0, m1, ...) that the intrinsic forms assign to.
00061 # define GI(X...) X
00062 #else
00063 # include "i386/mmx.h"
// Assembly build: provide _mm_empty() as a raw EMMS instruction.
00064 # define _mm_empty() __asm__ __volatile__ ("emms")
// Assembly build: __m64 is an alias for mmx_t (presumably supplied by
// i386/mmx.h -- TODO confirm), so the pointer casts below compile unchanged.
00065 # define __m64 mmx_t
// Assembly build: SI drops its first argument and keeps everything after it.
00066 # define SI(A,B...) B
// Assembly build: GI expands to nothing, so no m0..m7 locals are declared;
// the asm forms name the mm0..mm7 registers directly.
00067 # define GI(X...)
00068 #endif
00069
00070 #include "cpu_detect.h"
00071 #include "TDStretch.h"
00072
00073
00074
00076
00077
00078
00080
00081
// 4 x 24 table of ints defined in another translation unit. It is declared
// here but not referenced by any function visible in this part of the file.
00082 extern int scanOffsets[4][24];
00083
00084
00085 long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
00086 {
00087 uint tmp;
00088 uint counter = (overlapLength>>3)-1;
00089 __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
00090 GI(__m64 m0, m1, m2, m3, m4, m5);
00091 uint shift = overlapDividerBits;
00092
00093
00094 SI(m1 = pv1[0], movq_a2r(0, pv1, mm1));
00095 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
00096 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
00097 SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));
00098
00099 do {
00100
00101
00102
00103 SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1));
00104 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
00105 SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2));
00106 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
00107 SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3));
00108 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
00109 SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4));
00110 SI(m1 = pv1[4], movq_a2r(32, pv1, mm1));
00111 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
00112 pv1 += 4;
00113 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
00114 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
00115 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
00116 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
00117 pv2 += 4;
00118 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00119 } while ((--counter)!=0);
00120
00121
00122 SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
00123 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
00124 SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
00125 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
00126 SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
00127 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
00128 SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
00129 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
00130 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
00131 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
00132 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
00133 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00134
00135
00136
00137 SI(m1 = m0, movq_r2r(mm0, mm1));
00138 SI(m1 = _mm_srli_si64(m1, 32), psrld_i2r(32, mm1));
00139 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
00140 SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp));
00141 return tmp;
00142 }
00143
00144 #ifdef USE_MULTI_MMX
00145
00146 long TDStretchMMX::calcCrossCorrMulti(const short *pV1, const short *pV2) const
00147 {
00148
00149 static const __m64 mm_mask[4][8] __attribute__ ((aligned(8))) = {
00150 {
00151
00152 0xffffffffffffffffULL,
00153 0xffffffffffffffffULL,
00154 0xffffffffffffffffULL,
00155 0xffffffffffffffffULL,
00156 0,
00157 0,
00158 0,
00159 0
00160 },
00161 {
00162 0xffffffffffffffffULL,
00163 0xffffffffffffffffULL,
00164 0xffffffffffffffffULL,
00165 0x0000ffffffffffffULL,
00166 0,
00167 0,
00168 0,
00169 0
00170 },
00171 {
00172 0xffffffffffffffffULL,
00173 0xffffffffffffffffULL,
00174 0xffffffffffffffffULL,
00175 0x00000000ffffffffULL,
00176 0,
00177 0,
00178 0,
00179 0
00180 },
00181 {
00182 0xffffffffffffffffULL,
00183 0xffffffffffffffffULL,
00184 0xffffffffffffffffULL,
00185 0x000000000000ffffULL,
00186 0,
00187 0,
00188 0,
00189 0
00190 }
00191 };
00192 uint tmp;
00193 uint adjustedOverlapLength = overlapLength*channels;
00194 uint counter = ((adjustedOverlapLength+15)>>4)-1;
00195 uint remainder = (16-adjustedOverlapLength)&0xf;
00196
00197 __m64 *ph = (__m64*)&mm_mask[remainder&3][remainder>>2];
00198 __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
00199 GI(__m64 m0, m1, m2, m3, m4, m5, m6);
00200 uint shift = overlapDividerBits;
00201
00202
00203 SI(m1 = pv1[0], movq_a2r(0, pv1, mm1));
00204 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
00205 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
00206 SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));
00207
00208 do {
00209
00210
00211
00212 SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1));
00213 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
00214 SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2));
00215 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
00216 SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3));
00217 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
00218 SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4));
00219 SI(m1 = pv1[4], movq_a2r(32, pv1, mm1));
00220 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
00221 pv1 += 4;
00222 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
00223 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
00224 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
00225 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
00226 pv2 += 4;
00227 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00228 } while ((--counter)!=0);
00229
00230 SI(m6 = ph[0], movq_a2r(0, ph, mm6));
00231
00232 SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
00233 SI(m1 = _mm_and_si64(m1, m6), pand_r2r(mm6, mm1));
00234 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
00235 SI(m6 = ph[1], movq_a2r(8, ph, mm6));
00236 SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
00237 SI(m2 = _mm_and_si64(m2, m6), pand_r2r(mm6, mm2));
00238 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
00239 SI(m6 = ph[2], movq_a2r(16, ph, mm6));
00240 SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
00241 SI(m3 = _mm_and_si64(m3, m6), pand_r2r(mm6, mm3));
00242 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
00243 SI(m6 = ph[3], movq_a2r(24, ph, mm6));
00244 SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
00245 SI(m4 = _mm_and_si64(m4, m6), pand_r2r(mm6, mm4));
00246 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
00247 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
00248 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
00249 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
00250 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00251
00252
00253
00254 SI(m1 = m0, movq_r2r(mm0, mm1));
00255 SI(m1 = _mm_srli_si64(m1, 32), psrld_i2r(32, mm1));
00256 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
00257 SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp));
00258 return tmp;
00259 }
00260 #endif
00261
00262 void TDStretchMMX::clearCrossCorrState()
00263 {
00264 _mm_empty();
00265 }
00266
00267
// Blends pMidBuffer with `input` into `output` using a linearly moving pair
// of weights, computed with MMX.
//
// For every 16-bit sample the result is
//   (mid * wMid + in * wIn) >> overlapDividerBits
// packed back to 16 bits with signed saturation. wMid starts at overlapLength
// and drops by one per group of two shorts; wIn starts at 0 and rises by one.
// Two consecutive shorts share a weight pair (for interleaved stereo: one
// frame -- the interleaving itself is assumed from the function name).
//
// One loop pass reads two quadwords from each source and writes two
// quadwords (8 shorts). The loop runs overlapLength >> 2 times, so
// overlapLength must be at least 4 or the do/while counter wraps.
//
// Each SI(a, b) line holds the same step twice: `a` is the intrinsic form,
// `b` the inline-assembly form; the build keeps one of them. Statement order
// matters for the assembly form, which keeps state in mm0..mm7.
00268 void TDStretchMMX::overlapStereo(short *output, const short *input) const
00269 {
// Reset MMX state before starting.
00270 _mm_empty();
00271 uint shift = overlapDividerBits;
00272 uint counter = overlapLength>>2;
00273 __m64 *inPtr = (__m64*) input;
00274 __m64 *midPtr = (__m64*) pMidBuffer;
// Starts two quadwords early; it is advanced by 2 at the top of every pass,
// before anything is stored through it.
00275 __m64 *outPtr = ((__m64*) output)-2;
00276 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
00277
00278
// m5 = weight step, replicated to both dwords: low word 0xfffe (-2 for the
// mid weight), high word +2 (for the input weight).
00279 uint tmp0 = 0x0002fffe;
00280 SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5));
00281 SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5));
00282
// m6 = weights for the even groups: (wMid = overlapLength, wIn = 0).
00283 SI(m6 = _mm_cvtsi32_si64(overlapLength), movd_v2r(overlapLength, mm6));
00284 SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6));
00285
// m7 = weights for the odd groups: (wMid = overlapLength - 1, wIn = 1).
00286 uint tmp1 = (overlapLength-1)|0x00010000;
00287 SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7));
00288 SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7));
00289
00290 do {
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
// Load two quadwords from each source and interleave them word-wise so every
// dword holds a (mid, in) pair ready for pmaddwd against a (wMid, wIn) pair.
00307 SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));
00308 outPtr += 2;
00309 SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3));
00310 SI(m1 = m0, movq_r2r(mm0, mm1));
00311 SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));
00312 SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0));
00313 midPtr += 2;
00314 SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4));
00315 SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1));
00316 inPtr+=2;
00317 SI(m3 = m2, movq_r2r(mm2, mm3));
00318 SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2));
00319
// First quadword: low pairs use m6, high pairs use m7.
00320 SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0));
00321 SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3));
// m4 is free now and is reused as the shift count.
00322 SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4));
00323
00324 SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1));
// Step both weight pairs by (-2, +2) for the second quadword.
00325 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
00326 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
00327 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
00328
00329 SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2));
00330 SI(m1 = _mm_srai_pi32(m1, m4), psrad_r2r(mm4, mm1));
00331
00332 SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3));
00333 SI(m2 = _mm_srai_pi32(m2, m4), psrad_r2r(mm4, mm2));
// Pack the 32-bit results back to 16 bits with signed saturation.
00334 SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0));
00335 SI(m3 = _mm_srai_pi32(m3, m4), psrad_r2r(mm4, mm3));
// Step the weights again so they are ready for the next pass.
00336 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
00337 SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2));
00338 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
00339 SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr));
00340 SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr));
00341 } while ((--counter)!=0);
// Leave the MMX state cleared on exit.
00342 _mm_empty();
00343 }
00344
00345 #if 0
00346
// Compiled out: this definition sits inside an `#if 0` region.
//
// Apart from its name, the body is statement-for-statement the same as
// TDStretchMMX::overlapStereo in this file: it blends pMidBuffer with `input`
// into `output` as (mid * wMid + in * wIn) >> overlapDividerBits, stepping
// the weight pair by (-1, +1) for every two shorts and saturating the result
// to 16 bits. It does not read `channels`.
//
// Each SI(a, b) line holds the same step twice: `a` is the intrinsic form,
// `b` the inline-assembly form.
00347 void TDStretchMMX::overlapMulti(short *output, const short *input) const
00348 {
00349 _mm_empty();
00350 uint shift = overlapDividerBits;
00351 uint counter = overlapLength>>2;
00352 __m64 *inPtr = (__m64*) input;
00353 __m64 *midPtr = (__m64*) pMidBuffer;
// Starts two quadwords early; advanced by 2 at the top of every pass.
00354 __m64 *outPtr = ((__m64*) output)-2;
00355 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
00356
00357
// m5 = weight step (-2 for mid, +2 for input), replicated to both dwords.
00358 uint tmp0 = 0x0002fffe;
00359 SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5));
00360 SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5));
00361
// m6 = (wMid = overlapLength, wIn = 0).
00362 SI(m6 = _mm_cvtsi32_si64(overlapLength), movd_v2r(overlapLength, mm6));
00363 SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6));
00364
// m7 = (wMid = overlapLength - 1, wIn = 1).
00365 uint tmp1 = (overlapLength-1)|0x00010000;
00366 SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7));
00367 SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7));
00368
00369 do {
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382
00383
00384
00385
// Load two quadwords per source and interleave them into (mid, in) pairs.
00386 SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));
00387 outPtr += 2;
00388 SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3));
00389 SI(m1 = m0, movq_r2r(mm0, mm1));
00390 SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));
00391 SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0));
00392 midPtr += 2;
00393 SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4));
00394 SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1));
00395 inPtr+=2;
00396 SI(m3 = m2, movq_r2r(mm2, mm3));
00397 SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2));
00398
00399 SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0));
00400 SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3));
// m4 is reused as the shift count from here on.
00401 SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4));
00402
00403 SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1));
// Step the weights for the second quadword.
00404 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
00405 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
00406 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
00407
00408 SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2));
00409 SI(m1 = _mm_srai_pi32(m1, m4), psrad_r2r(mm4, mm1));
00410
00411 SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3));
00412 SI(m2 = _mm_srai_pi32(m2, m4), psrad_r2r(mm4, mm2));
// Pack back to 16 bits with signed saturation.
00413 SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0));
00414 SI(m3 = _mm_srai_pi32(m3, m4), psrad_r2r(mm4, mm3));
// Step the weights for the next pass.
00415 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
00416 SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2));
00417 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
00418 SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr));
00419 SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr));
00420 } while ((--counter)!=0);
00421 _mm_empty();
00422 }
00423 #endif
00424
00426
00427
00428
00430
00431 #include "FIRFilter.h"
00432 FIRFilterMMX::FIRFilterMMX() : FIRFilter()
00433 {
00434 filterCoeffsUnalign = NULL;
00435 }
00436
00437
00438 FIRFilterMMX::~FIRFilterMMX()
00439 {
00440 delete[] filterCoeffsUnalign;
00441 }
00442
00443
00444 void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
00445 {
00446 uint i;
00447 FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
00448
00449
00450 delete[] filterCoeffsUnalign;
00451 filterCoeffsUnalign = new short[2 * newLength + 8];
00452 filterCoeffsAlign = (short *)(((ulong)filterCoeffsUnalign + 15) & -16);
00453
00454
00455 for (i = 0;i < length; i += 4)
00456 {
00457 filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
00458 filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
00459 filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
00460 filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
00461
00462 filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
00463 filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
00464 filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
00465 filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
00466 }
00467 }
00468
00469
00470
00471
00472 uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
00473 {
00474 _mm_empty();
00475 __m64 *inPtr = (__m64*)src;
00476 __m64 *outPtr = ((__m64*)dest) - 1;
00477 uint counter = (numSamples - length) >> 1;
00478 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
00479
00480 do {
00481 __m64 *filterInPtr = inPtr;
00482 __m64 *filterPtr = (__m64*)filterCoeffsAlign;
00483 uint filterCounter = lengthDiv8;
00484
00485 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
00486 SI(m1 = filterInPtr[0], movq_a2r(0, filterInPtr, mm1));
00487 SI(m7 = _mm_setzero_si64(), pxor_r2r(mm7, mm7));
00488
00489 do {
00490 SI(m2 = filterInPtr[1], movq_a2r(8, filterInPtr, mm2));
00491 SI(m4 = m1, movq_r2r(mm1, mm4));
00492 SI(m3 = filterInPtr[2], movq_a2r(16, filterInPtr, mm3));
00493 SI(m1 = _mm_unpackhi_pi16(m1, m2), punpckhwd_r2r(mm2, mm1));
00494 SI(m6 = m2, movq_r2r(mm2, mm6));
00495 SI(m4 = _mm_unpacklo_pi16(m4, m2), punpcklwd_r2r(mm2, mm4));
00496 SI(m2 = filterPtr[0], movq_a2r(0, filterPtr, mm2));
00497 SI(m5 = m1, movq_r2r(mm1, mm5));
00498 SI(m6 = _mm_unpacklo_pi16(m6, m3), punpcklwd_r2r(mm3, mm6));
00499 SI(m4 = _mm_madd_pi16(m4, m2), pmaddwd_r2r(mm2, mm4));
00500 SI(m5 = _mm_madd_pi16(m5, m2), pmaddwd_r2r(mm2, mm5));
00501 SI(m2 = filterPtr[1], movq_a2r(8, filterPtr, mm2));
00502 SI(m0 = _mm_add_pi32(m0, m4), paddd_r2r(mm4, mm0));
00503 SI(m4 = m3, movq_r2r(mm3, mm4));
00504 SI(m1 = _mm_madd_pi16(m1, m2), pmaddwd_r2r(mm2, mm1));
00505 SI(m7 = _mm_add_pi32(m7, m5), paddd_r2r(mm5, mm7));
00506 SI(m6 = _mm_madd_pi16(m6, m2), pmaddwd_r2r(mm2, mm6));
00507 SI(m2 = filterInPtr[3], movq_a2r(24, filterInPtr, mm2));
00508 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
00509 SI(m1 = filterInPtr[4], movq_a2r(32, filterInPtr, mm1));
00510 SI(m7 = _mm_add_pi32(m7, m6), paddd_r2r(mm6, mm7));
00511 SI(m3 = _mm_unpackhi_pi16(m3, m2), punpckhwd_r2r(mm2, mm3));
00512 SI(m6 = m2, movq_r2r(mm2, mm6));
00513 SI(m4 = _mm_unpackhi_pi16(m4, m2), punpcklwd_r2r(mm2, mm4));
00514 SI(m2 = filterPtr[2], movq_a2r(16, filterInPtr, mm2));
00515 SI(m5 = m3, movq_r2r(mm3, mm5));
00516 SI(m6 = _mm_unpackhi_pi16(m6, m1), punpcklwd_r2r(mm1, mm6));
00517 filterPtr += 4;
00518 SI(m4 = _mm_madd_pi16(m4, m2), pmaddwd_r2r(mm2, mm4));
00519 filterInPtr += 4;
00520 SI(m5 = _mm_madd_pi16(m5, m2), pmaddwd_r2r(mm2, mm5));
00521 SI(m2 = filterPtr[-1], movq_a2r(-8, filterPtr, mm2));
00522 SI(m0 = _mm_add_pi32(m0, m4), paddd_r2r(mm4, mm0));
00523 SI(m3 = _mm_madd_pi16(m3, m2), pmaddwd_r2r(mm2, mm3));
00524 SI(m7 = _mm_add_pi32(m7, m5), paddd_r2r(mm5, mm7));
00525 SI(m6 = _mm_madd_pi16(m6, m2), pmaddwd_r2r(mm2, mm6));
00526 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00527 SI(m7 = _mm_add_pi32(m7, m6), paddd_r2r(mm6, mm7));
00528 } while ((--filterCounter)!=0);
00529
00530 SI(m4 = _mm_cvtsi32_si64(resultDivFactor), movd_v2r(resultDivFactor, mm4));
00531
00532 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
00533 outPtr++;
00534
00535 SI(m7 = _mm_srai_pi32(m7, m4), psrad_r2r(mm4, mm7));
00536 inPtr++;
00537
00538 SI(m0 = _mm_packs_pi32(m0, m7), packssdw_r2r(mm7, mm0));
00539 SI(*outPtr = m0, movq_r2a(mm0, 0, outPtr));
00540 } while ((--counter)!=0);
00541
00542 _mm_empty();
00543 return (numSamples & 0xfffffffe) - length;
00544 }
00545
00546 #endif // ALLOW_MMX