00001
00002
00003
00004
00005
/*
 * The SSE copy loops in this file are only enabled when SSE2 is also
 * available: without HAVE_SSE2, HAVE_SSE is dropped and the MMX code
 * paths are used instead.
 * NOTE(review): the rationale is not visible here (presumably the MMX path
 * is faster on SSE1-only CPUs) -- confirm against the project history.
 */
#ifndef HAVE_SSE2
#undef HAVE_SSE
#endif
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
/*
 * HAVE_ONLY_MMX1 marks a build that has plain MMX but none of the
 * extensions (MMX2, 3DNow!, SSE).  Code below uses it to skip the
 * PREFETCH instructions and to raise MIN_LEN.
 */
#undef HAVE_ONLY_MMX1
#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
#define HAVE_ONLY_MMX1
#endif
00076
00077
00078
/*
 * small_memcpy(to, from, n): copy n bytes with a single "rep; movsb"
 * (x86 / x86-64, GCC-style inline assembly).
 *
 * `to` and `from` must be modifiable pointer lvalues: they are bound to the
 * destination/source index registers as outputs, so after the copy both
 * point just past the bytes that were transferred (this assumes the
 * direction flag is clear, as the ABI requires).  n may be any integer
 * expression; it is converted to unsigned long to match the count register
 * operand.
 *
 * The body is wrapped in do { } while (0) so the macro behaves like one
 * statement, e.g. in "if (x) small_memcpy(a, b, n); else ...".
 */
#define small_memcpy(to,from,n)\
do {\
    register unsigned long int dummy;\
    __asm__ __volatile__(\
        "rep; movsb"\
        :"=&D"(to), "=&S"(from), "=&c"(dummy)\
        :"0" (to), "1" (from), "2" ((unsigned long int)(n))\
        : "memory");\
} while (0)
00091
/*
 * MMREG_SIZE: alignment (in bytes, power of two) that fast_memcpy brings the
 * destination pointer to before entering its 64-byte copy loops.
 * NOTE(review): the non-SSE value carried a trailing "//8" in the original,
 * which suggests it used to be the 8-byte MMX register width and was raised
 * to 64 on purpose -- confirm before changing it.
 */
#undef MMREG_SIZE
#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 64 /* was 8 */
#endif
00098
#undef PREFETCH
#undef EMMS

/*
 * PREFETCH: mnemonic pasted in front of an address operand inside the asm
 * strings below, e.g. PREFETCH" 320(%0)\n".
 * When no prefetch instruction is available the macro must turn the rest of
 * that asm line into a comment.  '#' starts a comment for the GNU assembler
 * on every x86 target, whereas '/' only does so on some of them, so " # nop"
 * is used instead of "/nop".
 */
#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH "prefetch"
#else
#define PREFETCH " # nop"
#endif

/*
 * EMMS: instruction issued after the MMX copy loops to leave MMX state;
 * 3DNow! builds use "femms" instead of "emms".
 */
#ifdef HAVE_3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
00116
/*
 * MOVNTQ: 8-byte store mnemonic used by the MMX copy loops.  With MMX2 the
 * non-temporal "movntq" is used; otherwise it degrades to a plain "movq".
 */
#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif
00123
/*
 * MIN_LEN: smallest length (in bytes) for which fast_memcpy takes the
 * MMX/SSE path; shorter copies are done entirely by small_memcpy.
 * Plain-MMX builds use a much higher threshold (2 KiB instead of 64 bytes).
 */
#undef MIN_LEN
#ifdef HAVE_ONLY_MMX1
#define MIN_LEN 0x800
#else
#define MIN_LEN 0x40
#endif
00130
00131 void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
00132 {
00133 void *retval;
00134 size_t i;
00135 retval = to;
00136 long oldbx;
00137 #ifdef STATISTIC
00138
00139 {
00140 static int freq[33];
00141 static int t=0;
00142 int i;
00143 for(i=0; len>(1<<i); i++);
00144 freq[i]++;
00145 t++;
00146 if(t % (1024 * 1024) == 0)
00147 for(i=0; i<32; i++)
00148 printf("freq < %8d %4d\n", 1<<i, freq[i]);
00149 }
00150 #endif
00151
00152 #ifndef HAVE_ONLY_MMX1
00153
00154 __asm__ __volatile__ (
00155 PREFETCH" (%0)\n"
00156 PREFETCH" 64(%0)\n"
00157 PREFETCH" 128(%0)\n"
00158 PREFETCH" 192(%0)\n"
00159 PREFETCH" 256(%0)\n"
00160 : : "r" (from) );
00161 #endif
00162 if (len >= MIN_LEN)
00163 {
00164 register unsigned long int delta;
00165
00166 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
00167 if (delta)
00168 {
00169 delta=MMREG_SIZE-delta;
00170 len -= delta;
00171 small_memcpy(to, from, delta);
00172 }
00173 i = len >> 6;
00174 len&=63;
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184 #ifdef HAVE_SSE
00185 if (((unsigned long)from) & 15)
00186
00187 for (; i>0; i--)
00188 {
00189 __asm__ __volatile__ (
00190 PREFETCH" 320(%0)\n"
00191 "movups (%0), %%xmm0\n"
00192 "movups 16(%0), %%xmm1\n"
00193 "movups 32(%0), %%xmm2\n"
00194 "movups 48(%0), %%xmm3\n"
00195 "movntps %%xmm0, (%1)\n"
00196 "movntps %%xmm1, 16(%1)\n"
00197 "movntps %%xmm2, 32(%1)\n"
00198 "movntps %%xmm3, 48(%1)\n"
00199 :
00200 : "r" (from), "r" (to)
00201 : "memory"
00202 );
00203 from=((const unsigned char *) from)+64;
00204 to=((unsigned char *)to)+64;
00205 }
00206 else
00207
00208
00209
00210
00211
00212 for (; i>0; i--)
00213 {
00214 __asm__ __volatile__ (
00215 PREFETCH" 320(%0)\n"
00216 "movaps (%0), %%xmm0\n"
00217 "movaps 16(%0), %%xmm1\n"
00218 "movaps 32(%0), %%xmm2\n"
00219 "movaps 48(%0), %%xmm3\n"
00220 "movntps %%xmm0, (%1)\n"
00221 "movntps %%xmm1, 16(%1)\n"
00222 "movntps %%xmm2, 32(%1)\n"
00223 "movntps %%xmm3, 48(%1)\n"
00224 :: "r" (from), "r" (to) : "memory");
00225 from=((const unsigned char *)from)+64;
00226 to=((unsigned char *)to)+64;
00227 }
00228 #else
00229
00230 for (; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
00231 {
00232 __asm__ __volatile__ (
00233 #ifndef HAVE_ONLY_MMX1
00234 PREFETCH" 320(%0)\n"
00235 #endif
00236 "movq (%0), %%mm0\n"
00237 "movq 8(%0), %%mm1\n"
00238 "movq 16(%0), %%mm2\n"
00239 "movq 24(%0), %%mm3\n"
00240 "movq 32(%0), %%mm4\n"
00241 "movq 40(%0), %%mm5\n"
00242 "movq 48(%0), %%mm6\n"
00243 "movq 56(%0), %%mm7\n"
00244 MOVNTQ" %%mm0, (%1)\n"
00245 MOVNTQ" %%mm1, 8(%1)\n"
00246 MOVNTQ" %%mm2, 16(%1)\n"
00247 MOVNTQ" %%mm3, 24(%1)\n"
00248 MOVNTQ" %%mm4, 32(%1)\n"
00249 MOVNTQ" %%mm5, 40(%1)\n"
00250 MOVNTQ" %%mm6, 48(%1)\n"
00251 MOVNTQ" %%mm7, 56(%1)\n"
00252 :: "r" (from), "r" (to) : "memory");
00253 from=((const unsigned char *)from)+64;
00254 to=((unsigned char *)to)+64;
00255 }
00256
00257
00258
00259 if (i>=BLOCK_SIZE/64)
00260 asm volatile(
00261
00262 MOVX" %%"REG_b", %6\n\t"
00263 "xor %%"REG_a", %%"REG_a" \n\t"
00264 ASMALIGN(4)
00265 "1: \n\t"
00266 "movl (%0, %%"REG_a"), %%ebx \n\t"
00267 "movl 32(%0, %%"REG_a"), %%ebx \n\t"
00268 "movl 64(%0, %%"REG_a"), %%ebx \n\t"
00269 "movl 96(%0, %%"REG_a"), %%ebx \n\t"
00270 "add $128, %%"REG_a" \n\t"
00271 "cmp %3, %%"REG_a" \n\t"
00272 " jb 1b \n\t"
00273
00274 "xor %%"REG_a", %%"REG_a" \n\t"
00275 ASMALIGN(4)
00276 "2: \n\t"
00277 "movq (%0, %%"REG_a"), %%mm0\n"
00278 "movq 8(%0, %%"REG_a"), %%mm1\n"
00279 "movq 16(%0, %%"REG_a"), %%mm2\n"
00280 "movq 24(%0, %%"REG_a"), %%mm3\n"
00281 "movq 32(%0, %%"REG_a"), %%mm4\n"
00282 "movq 40(%0, %%"REG_a"), %%mm5\n"
00283 "movq 48(%0, %%"REG_a"), %%mm6\n"
00284 "movq 56(%0, %%"REG_a"), %%mm7\n"
00285 MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
00286 MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
00287 MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
00288 MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
00289 MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
00290 MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
00291 MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
00292 MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
00293 "add $64, %%"REG_a" \n\t"
00294 "cmp %3, %%"REG_a" \n\t"
00295 "jb 2b \n\t"
00296
00297 #if CONFUSION_FACTOR > 0
00298
00299 "mov %5, %%"REG_a" \n\t"
00300 "2: \n\t"
00301 "movl (%0), %%ebx \n\t"
00302 "movl (%0), %%ebx \n\t"
00303 "movl (%0), %%ebx \n\t"
00304 "movl (%0), %%ebx \n\t"
00305 "dec %%"REG_a" \n\t"
00306 " jnz 2b \n\t"
00307 #endif
00308
00309 "xor %%"REG_a", %%"REG_a" \n\t"
00310 "add %3, %0 \n\t"
00311 "add %3, %1 \n\t"
00312 "sub %4, %2 \n\t"
00313 "cmp %4, %2 \n\t"
00314 " jae 1b \n\t"
00315 MOVX" %6, %%"REG_b" \n\t"
00316 : "+r" (from), "+r" (to), "+r" (i)
00317 : "r" ((long)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((long)CONFUSION_FACTOR), "m" (oldbx)
00318 : "%"REG_a, "memory"
00319 );
00320
00321 for (; i>0; i--)
00322 {
00323 __asm__ __volatile__ (
00324 #ifndef HAVE_ONLY_MMX1
00325 PREFETCH" 320(%0)\n"
00326 #endif
00327 "movq (%0), %%mm0\n"
00328 "movq 8(%0), %%mm1\n"
00329 "movq 16(%0), %%mm2\n"
00330 "movq 24(%0), %%mm3\n"
00331 "movq 32(%0), %%mm4\n"
00332 "movq 40(%0), %%mm5\n"
00333 "movq 48(%0), %%mm6\n"
00334 "movq 56(%0), %%mm7\n"
00335 MOVNTQ" %%mm0, (%1)\n"
00336 MOVNTQ" %%mm1, 8(%1)\n"
00337 MOVNTQ" %%mm2, 16(%1)\n"
00338 MOVNTQ" %%mm3, 24(%1)\n"
00339 MOVNTQ" %%mm4, 32(%1)\n"
00340 MOVNTQ" %%mm5, 40(%1)\n"
00341 MOVNTQ" %%mm6, 48(%1)\n"
00342 MOVNTQ" %%mm7, 56(%1)\n"
00343 :: "r" (from), "r" (to) : "memory");
00344 from=((const unsigned char *)from)+64;
00345 to=((unsigned char *)to)+64;
00346 }
00347
00348 #endif
00349 #ifdef HAVE_MMX2
00350
00351
00352 __asm__ __volatile__ ("sfence":::"memory");
00353 #endif
00354 #ifndef HAVE_SSE
00355
00356 __asm__ __volatile__ (EMMS:::"memory");
00357 #endif
00358 }
00359
00360
00361
00362 if (len) small_memcpy(to, from, len);
00363 return retval;
00364 }
00365
00366
00367