00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef FFMPEG_DSPUTIL_IWMMXT_RND_H
00023 #define FFMPEG_DSPUTIL_IWMMXT_RND_H
00024
00025 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00026 {
00027 int stride = line_size;
00028 __asm__ __volatile__ (
00029 "and r12, %[pixels], #7 \n\t"
00030 "bic %[pixels], %[pixels], #7 \n\t"
00031 "tmcr wcgr1, r12 \n\t"
00032 "add r4, %[pixels], %[line_size] \n\t"
00033 "add r5, %[block], %[line_size] \n\t"
00034 "mov %[line_size], %[line_size], lsl #1 \n\t"
00035 "1: \n\t"
00036 "wldrd wr0, [%[pixels]] \n\t"
00037 "subs %[h], %[h], #2 \n\t"
00038 "wldrd wr1, [%[pixels], #8] \n\t"
00039 "add %[pixels], %[pixels], %[line_size] \n\t"
00040 "wldrd wr3, [r4] \n\t"
00041 "pld [%[pixels]] \n\t"
00042 "pld [%[pixels], #32] \n\t"
00043 "wldrd wr4, [r4, #8] \n\t"
00044 "add r4, r4, %[line_size] \n\t"
00045 "walignr1 wr8, wr0, wr1 \n\t"
00046 "pld [r4] \n\t"
00047 "pld [r4, #32] \n\t"
00048 "walignr1 wr10, wr3, wr4 \n\t"
00049 "wstrd wr8, [%[block]] \n\t"
00050 "add %[block], %[block], %[line_size] \n\t"
00051 "wstrd wr10, [r5] \n\t"
00052 "add r5, r5, %[line_size] \n\t"
00053 "bne 1b \n\t"
00054 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00055 :
00056 : "memory", "r4", "r5", "r12");
00057 }
00058
00059 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00060 {
00061 int stride = line_size;
00062 __asm__ __volatile__ (
00063 "and r12, %[pixels], #7 \n\t"
00064 "bic %[pixels], %[pixels], #7 \n\t"
00065 "tmcr wcgr1, r12 \n\t"
00066 "add r4, %[pixels], %[line_size] \n\t"
00067 "add r5, %[block], %[line_size] \n\t"
00068 "mov %[line_size], %[line_size], lsl #1 \n\t"
00069 "1: \n\t"
00070 "wldrd wr0, [%[pixels]] \n\t"
00071 "subs %[h], %[h], #2 \n\t"
00072 "wldrd wr1, [%[pixels], #8] \n\t"
00073 "add %[pixels], %[pixels], %[line_size] \n\t"
00074 "wldrd wr3, [r4] \n\t"
00075 "pld [%[pixels]] \n\t"
00076 "pld [%[pixels], #32] \n\t"
00077 "wldrd wr4, [r4, #8] \n\t"
00078 "add r4, r4, %[line_size] \n\t"
00079 "walignr1 wr8, wr0, wr1 \n\t"
00080 "wldrd wr0, [%[block]] \n\t"
00081 "wldrd wr2, [r5] \n\t"
00082 "pld [r4] \n\t"
00083 "pld [r4, #32] \n\t"
00084 "walignr1 wr10, wr3, wr4 \n\t"
00085 WAVG2B" wr8, wr8, wr0 \n\t"
00086 WAVG2B" wr10, wr10, wr2 \n\t"
00087 "wstrd wr8, [%[block]] \n\t"
00088 "add %[block], %[block], %[line_size] \n\t"
00089 "wstrd wr10, [r5] \n\t"
00090 "pld [%[block]] \n\t"
00091 "pld [%[block], #32] \n\t"
00092 "add r5, r5, %[line_size] \n\t"
00093 "pld [r5] \n\t"
00094 "pld [r5, #32] \n\t"
00095 "bne 1b \n\t"
00096 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00097 :
00098 : "memory", "r4", "r5", "r12");
00099 }
00100
00101 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00102 {
00103 int stride = line_size;
00104 __asm__ __volatile__ (
00105 "and r12, %[pixels], #7 \n\t"
00106 "bic %[pixels], %[pixels], #7 \n\t"
00107 "tmcr wcgr1, r12 \n\t"
00108 "add r4, %[pixels], %[line_size] \n\t"
00109 "add r5, %[block], %[line_size] \n\t"
00110 "mov %[line_size], %[line_size], lsl #1 \n\t"
00111 "1: \n\t"
00112 "wldrd wr0, [%[pixels]] \n\t"
00113 "wldrd wr1, [%[pixels], #8] \n\t"
00114 "subs %[h], %[h], #2 \n\t"
00115 "wldrd wr2, [%[pixels], #16] \n\t"
00116 "add %[pixels], %[pixels], %[line_size] \n\t"
00117 "wldrd wr3, [r4] \n\t"
00118 "pld [%[pixels]] \n\t"
00119 "pld [%[pixels], #32] \n\t"
00120 "walignr1 wr8, wr0, wr1 \n\t"
00121 "wldrd wr4, [r4, #8] \n\t"
00122 "walignr1 wr9, wr1, wr2 \n\t"
00123 "wldrd wr5, [r4, #16] \n\t"
00124 "add r4, r4, %[line_size] \n\t"
00125 "pld [r4] \n\t"
00126 "pld [r4, #32] \n\t"
00127 "walignr1 wr10, wr3, wr4 \n\t"
00128 "wstrd wr8, [%[block]] \n\t"
00129 "walignr1 wr11, wr4, wr5 \n\t"
00130 "wstrd wr9, [%[block], #8] \n\t"
00131 "add %[block], %[block], %[line_size] \n\t"
00132 "wstrd wr10, [r5] \n\t"
00133 "wstrd wr11, [r5, #8] \n\t"
00134 "add r5, r5, %[line_size] \n\t"
00135 "bne 1b \n\t"
00136 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00137 :
00138 : "memory", "r4", "r5", "r12");
00139 }
00140
00141 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00142 {
00143 int stride = line_size;
00144 __asm__ __volatile__ (
00145 "pld [%[pixels]] \n\t"
00146 "pld [%[pixels], #32] \n\t"
00147 "pld [%[block]] \n\t"
00148 "pld [%[block], #32] \n\t"
00149 "and r12, %[pixels], #7 \n\t"
00150 "bic %[pixels], %[pixels], #7 \n\t"
00151 "tmcr wcgr1, r12 \n\t"
00152 "add r4, %[pixels], %[line_size]\n\t"
00153 "add r5, %[block], %[line_size] \n\t"
00154 "mov %[line_size], %[line_size], lsl #1 \n\t"
00155 "1: \n\t"
00156 "wldrd wr0, [%[pixels]] \n\t"
00157 "wldrd wr1, [%[pixels], #8] \n\t"
00158 "subs %[h], %[h], #2 \n\t"
00159 "wldrd wr2, [%[pixels], #16] \n\t"
00160 "add %[pixels], %[pixels], %[line_size] \n\t"
00161 "wldrd wr3, [r4] \n\t"
00162 "pld [%[pixels]] \n\t"
00163 "pld [%[pixels], #32] \n\t"
00164 "walignr1 wr8, wr0, wr1 \n\t"
00165 "wldrd wr4, [r4, #8] \n\t"
00166 "walignr1 wr9, wr1, wr2 \n\t"
00167 "wldrd wr5, [r4, #16] \n\t"
00168 "add r4, r4, %[line_size] \n\t"
00169 "wldrd wr0, [%[block]] \n\t"
00170 "pld [r4] \n\t"
00171 "wldrd wr1, [%[block], #8] \n\t"
00172 "pld [r4, #32] \n\t"
00173 "wldrd wr2, [r5] \n\t"
00174 "walignr1 wr10, wr3, wr4 \n\t"
00175 "wldrd wr3, [r5, #8] \n\t"
00176 WAVG2B" wr8, wr8, wr0 \n\t"
00177 WAVG2B" wr9, wr9, wr1 \n\t"
00178 WAVG2B" wr10, wr10, wr2 \n\t"
00179 "wstrd wr8, [%[block]] \n\t"
00180 "walignr1 wr11, wr4, wr5 \n\t"
00181 WAVG2B" wr11, wr11, wr3 \n\t"
00182 "wstrd wr9, [%[block], #8] \n\t"
00183 "add %[block], %[block], %[line_size] \n\t"
00184 "wstrd wr10, [r5] \n\t"
00185 "pld [%[block]] \n\t"
00186 "pld [%[block], #32] \n\t"
00187 "wstrd wr11, [r5, #8] \n\t"
00188 "add r5, r5, %[line_size] \n\t"
00189 "pld [r5] \n\t"
00190 "pld [r5, #32] \n\t"
00191 "bne 1b \n\t"
00192 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00193 :
00194 : "memory", "r4", "r5", "r12");
00195 }
00196
00197 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00198 {
00199 int stride = line_size;
00200
00201
00202 SET_RND(wr15);
00203 __asm__ __volatile__(
00204 "pld [%[pixels]] \n\t"
00205 "pld [%[pixels], #32] \n\t"
00206 "and r12, %[pixels], #7 \n\t"
00207 "bic %[pixels], %[pixels], #7 \n\t"
00208 "tmcr wcgr1, r12 \n\t"
00209 "add r12, r12, #1 \n\t"
00210 "add r4, %[pixels], %[line_size]\n\t"
00211 "tmcr wcgr2, r12 \n\t"
00212 "add r5, %[block], %[line_size] \n\t"
00213 "mov %[line_size], %[line_size], lsl #1 \n\t"
00214
00215 "1: \n\t"
00216 "wldrd wr10, [%[pixels]] \n\t"
00217 "cmp r12, #8 \n\t"
00218 "wldrd wr11, [%[pixels], #8] \n\t"
00219 "add %[pixels], %[pixels], %[line_size] \n\t"
00220 "wldrd wr13, [r4] \n\t"
00221 "pld [%[pixels]] \n\t"
00222 "wldrd wr14, [r4, #8] \n\t"
00223 "pld [%[pixels], #32] \n\t"
00224 "add r4, r4, %[line_size] \n\t"
00225 "walignr1 wr0, wr10, wr11 \n\t"
00226 "pld [r4] \n\t"
00227 "pld [r4, #32] \n\t"
00228 "walignr1 wr2, wr13, wr14 \n\t"
00229 "wmoveq wr4, wr11 \n\t"
00230 "wmoveq wr6, wr14 \n\t"
00231 "walignr2ne wr4, wr10, wr11 \n\t"
00232 "walignr2ne wr6, wr13, wr14 \n\t"
00233 WAVG2B" wr0, wr0, wr4 \n\t"
00234 WAVG2B" wr2, wr2, wr6 \n\t"
00235 "wstrd wr0, [%[block]] \n\t"
00236 "subs %[h], %[h], #2 \n\t"
00237 "wstrd wr2, [r5] \n\t"
00238 "add %[block], %[block], %[line_size] \n\t"
00239 "add r5, r5, %[line_size] \n\t"
00240 "bne 1b \n\t"
00241 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00242 :
00243 : "r4", "r5", "r12", "memory");
00244 }
00245
00246 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00247 {
00248 int stride = line_size;
00249
00250
00251 SET_RND(wr15);
00252 __asm__ __volatile__(
00253 "pld [%[pixels]] \n\t"
00254 "pld [%[pixels], #32] \n\t"
00255 "and r12, %[pixels], #7 \n\t"
00256 "bic %[pixels], %[pixels], #7 \n\t"
00257 "tmcr wcgr1, r12 \n\t"
00258 "add r12, r12, #1 \n\t"
00259 "add r4, %[pixels], %[line_size]\n\t"
00260 "tmcr wcgr2, r12 \n\t"
00261 "add r5, %[block], %[line_size] \n\t"
00262 "mov %[line_size], %[line_size], lsl #1 \n\t"
00263
00264 "1: \n\t"
00265 "wldrd wr10, [%[pixels]] \n\t"
00266 "cmp r12, #8 \n\t"
00267 "wldrd wr11, [%[pixels], #8] \n\t"
00268 "wldrd wr12, [%[pixels], #16] \n\t"
00269 "add %[pixels], %[pixels], %[line_size] \n\t"
00270 "wldrd wr13, [r4] \n\t"
00271 "pld [%[pixels]] \n\t"
00272 "wldrd wr14, [r4, #8] \n\t"
00273 "pld [%[pixels], #32] \n\t"
00274 "wldrd wr15, [r4, #16] \n\t"
00275 "add r4, r4, %[line_size] \n\t"
00276 "walignr1 wr0, wr10, wr11 \n\t"
00277 "pld [r4] \n\t"
00278 "pld [r4, #32] \n\t"
00279 "walignr1 wr1, wr11, wr12 \n\t"
00280 "walignr1 wr2, wr13, wr14 \n\t"
00281 "walignr1 wr3, wr14, wr15 \n\t"
00282 "wmoveq wr4, wr11 \n\t"
00283 "wmoveq wr5, wr12 \n\t"
00284 "wmoveq wr6, wr14 \n\t"
00285 "wmoveq wr7, wr15 \n\t"
00286 "walignr2ne wr4, wr10, wr11 \n\t"
00287 "walignr2ne wr5, wr11, wr12 \n\t"
00288 "walignr2ne wr6, wr13, wr14 \n\t"
00289 "walignr2ne wr7, wr14, wr15 \n\t"
00290 WAVG2B" wr0, wr0, wr4 \n\t"
00291 WAVG2B" wr1, wr1, wr5 \n\t"
00292 "wstrd wr0, [%[block]] \n\t"
00293 WAVG2B" wr2, wr2, wr6 \n\t"
00294 "wstrd wr1, [%[block], #8] \n\t"
00295 WAVG2B" wr3, wr3, wr7 \n\t"
00296 "add %[block], %[block], %[line_size] \n\t"
00297 "wstrd wr2, [r5] \n\t"
00298 "subs %[h], %[h], #2 \n\t"
00299 "wstrd wr3, [r5, #8] \n\t"
00300 "add r5, r5, %[line_size] \n\t"
00301 "bne 1b \n\t"
00302 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00303 :
00304 : "r4", "r5", "r12", "memory");
00305 }
00306
00307 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00308 {
00309 int stride = line_size;
00310
00311
00312 SET_RND(wr15);
00313 __asm__ __volatile__(
00314 "pld [%[pixels]] \n\t"
00315 "pld [%[pixels], #32] \n\t"
00316 "pld [%[block]] \n\t"
00317 "pld [%[block], #32] \n\t"
00318 "and r12, %[pixels], #7 \n\t"
00319 "bic %[pixels], %[pixels], #7 \n\t"
00320 "tmcr wcgr1, r12 \n\t"
00321 "add r12, r12, #1 \n\t"
00322 "add r4, %[pixels], %[line_size]\n\t"
00323 "tmcr wcgr2, r12 \n\t"
00324 "add r5, %[block], %[line_size] \n\t"
00325 "mov %[line_size], %[line_size], lsl #1 \n\t"
00326 "pld [r5] \n\t"
00327 "pld [r5, #32] \n\t"
00328
00329 "1: \n\t"
00330 "wldrd wr10, [%[pixels]] \n\t"
00331 "cmp r12, #8 \n\t"
00332 "wldrd wr11, [%[pixels], #8] \n\t"
00333 "add %[pixels], %[pixels], %[line_size] \n\t"
00334 "wldrd wr13, [r4] \n\t"
00335 "pld [%[pixels]] \n\t"
00336 "wldrd wr14, [r4, #8] \n\t"
00337 "pld [%[pixels], #32] \n\t"
00338 "add r4, r4, %[line_size] \n\t"
00339 "walignr1 wr0, wr10, wr11 \n\t"
00340 "pld [r4] \n\t"
00341 "pld [r4, #32] \n\t"
00342 "walignr1 wr2, wr13, wr14 \n\t"
00343 "wmoveq wr4, wr11 \n\t"
00344 "wmoveq wr6, wr14 \n\t"
00345 "walignr2ne wr4, wr10, wr11 \n\t"
00346 "wldrd wr10, [%[block]] \n\t"
00347 "walignr2ne wr6, wr13, wr14 \n\t"
00348 "wldrd wr12, [r5] \n\t"
00349 WAVG2B" wr0, wr0, wr4 \n\t"
00350 WAVG2B" wr2, wr2, wr6 \n\t"
00351 WAVG2B" wr0, wr0, wr10 \n\t"
00352 WAVG2B" wr2, wr2, wr12 \n\t"
00353 "wstrd wr0, [%[block]] \n\t"
00354 "subs %[h], %[h], #2 \n\t"
00355 "wstrd wr2, [r5] \n\t"
00356 "add %[block], %[block], %[line_size] \n\t"
00357 "add r5, r5, %[line_size] \n\t"
00358 "pld [%[block]] \n\t"
00359 "pld [%[block], #32] \n\t"
00360 "pld [r5] \n\t"
00361 "pld [r5, #32] \n\t"
00362 "bne 1b \n\t"
00363 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00364 :
00365 : "r4", "r5", "r12", "memory");
00366 }
00367
00368 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00369 {
00370 int stride = line_size;
00371
00372
00373 SET_RND(wr15);
00374 __asm__ __volatile__(
00375 "pld [%[pixels]] \n\t"
00376 "pld [%[pixels], #32] \n\t"
00377 "pld [%[block]] \n\t"
00378 "pld [%[block], #32] \n\t"
00379 "and r12, %[pixels], #7 \n\t"
00380 "bic %[pixels], %[pixels], #7 \n\t"
00381 "tmcr wcgr1, r12 \n\t"
00382 "add r12, r12, #1 \n\t"
00383 "add r4, %[pixels], %[line_size]\n\t"
00384 "tmcr wcgr2, r12 \n\t"
00385 "add r5, %[block], %[line_size] \n\t"
00386 "mov %[line_size], %[line_size], lsl #1 \n\t"
00387 "pld [r5] \n\t"
00388 "pld [r5, #32] \n\t"
00389
00390 "1: \n\t"
00391 "wldrd wr10, [%[pixels]] \n\t"
00392 "cmp r12, #8 \n\t"
00393 "wldrd wr11, [%[pixels], #8] \n\t"
00394 "wldrd wr12, [%[pixels], #16] \n\t"
00395 "add %[pixels], %[pixels], %[line_size] \n\t"
00396 "wldrd wr13, [r4] \n\t"
00397 "pld [%[pixels]] \n\t"
00398 "wldrd wr14, [r4, #8] \n\t"
00399 "pld [%[pixels], #32] \n\t"
00400 "wldrd wr15, [r4, #16] \n\t"
00401 "add r4, r4, %[line_size] \n\t"
00402 "walignr1 wr0, wr10, wr11 \n\t"
00403 "pld [r4] \n\t"
00404 "pld [r4, #32] \n\t"
00405 "walignr1 wr1, wr11, wr12 \n\t"
00406 "walignr1 wr2, wr13, wr14 \n\t"
00407 "walignr1 wr3, wr14, wr15 \n\t"
00408 "wmoveq wr4, wr11 \n\t"
00409 "wmoveq wr5, wr12 \n\t"
00410 "wmoveq wr6, wr14 \n\t"
00411 "wmoveq wr7, wr15 \n\t"
00412 "walignr2ne wr4, wr10, wr11 \n\t"
00413 "walignr2ne wr5, wr11, wr12 \n\t"
00414 "walignr2ne wr6, wr13, wr14 \n\t"
00415 "walignr2ne wr7, wr14, wr15 \n\t"
00416 "wldrd wr10, [%[block]] \n\t"
00417 WAVG2B" wr0, wr0, wr4 \n\t"
00418 "wldrd wr11, [%[block], #8] \n\t"
00419 WAVG2B" wr1, wr1, wr5 \n\t"
00420 "wldrd wr12, [r5] \n\t"
00421 WAVG2B" wr2, wr2, wr6 \n\t"
00422 "wldrd wr13, [r5, #8] \n\t"
00423 WAVG2B" wr3, wr3, wr7 \n\t"
00424 WAVG2B" wr0, wr0, wr10 \n\t"
00425 WAVG2B" wr1, wr1, wr11 \n\t"
00426 WAVG2B" wr2, wr2, wr12 \n\t"
00427 WAVG2B" wr3, wr3, wr13 \n\t"
00428 "wstrd wr0, [%[block]] \n\t"
00429 "subs %[h], %[h], #2 \n\t"
00430 "wstrd wr1, [%[block], #8] \n\t"
00431 "add %[block], %[block], %[line_size] \n\t"
00432 "wstrd wr2, [r5] \n\t"
00433 "pld [%[block]] \n\t"
00434 "wstrd wr3, [r5, #8] \n\t"
00435 "add r5, r5, %[line_size] \n\t"
00436 "pld [%[block], #32] \n\t"
00437 "pld [r5] \n\t"
00438 "pld [r5, #32] \n\t"
00439 "bne 1b \n\t"
00440 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00441 :
00442 :"r4", "r5", "r12", "memory");
00443 }
00444
/*
 * Average into block the vertically interpolated source, for an 8-byte-wide
 * block of h rows (iwMMXt).
 *
 * Each output row is WAVG2B(source row y, source row y+1), which is then
 * combined with the existing destination row by a second WAVG2B and stored.
 * WAVG2B is a mnemonic macro defined outside this view.
 *
 * The source pointer is rounded down to an 8-byte boundary and every row is
 * rebuilt from two 8-byte loads with walignr1, using the original low three
 * address bits held in wcgr1. One source row is loaded before the loop, and
 * each of the two unrolled halves loads one more, so h + 1 source rows are
 * read in total.
 *
 * The loop handles two rows per iteration and only ends when h reaches
 * exactly zero after "subs h, h, #2", so h must be a positive even number.
 */
00445 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00446 {
/* writable copy: the stride is bound as a "+r" (read/write) asm operand */
00447 int stride = line_size;
00448
00449
00450 __asm__ __volatile__(
00451 "pld [%[pixels]] \n\t"
00452 "pld [%[pixels], #32] \n\t"
/* r12 = pixels & 7 (byte offset) -> wcgr1; pixels rounded down to 8 bytes */
00453 "and r12, %[pixels], #7 \n\t"
00454 "tmcr wcgr1, r12 \n\t"
00455 "bic %[pixels], %[pixels], #7 \n\t"
00456
/* prologue: first source row, realigned, kept in wr0 */
00457 "wldrd wr10, [%[pixels]] \n\t"
00458 "wldrd wr11, [%[pixels], #8] \n\t"
00459 "pld [%[block]] \n\t"
00460 "add %[pixels], %[pixels], %[line_size] \n\t"
00461 "walignr1 wr0, wr10, wr11 \n\t"
00462 "pld [%[pixels]] \n\t"
00463 "pld [%[pixels], #32] \n\t"
00464
00465 "1: \n\t"
/* first half: next source row -> wr4; output = avg(avg(wr0, wr4), block) */
00466 "wldrd wr10, [%[pixels]] \n\t"
00467 "wldrd wr11, [%[pixels], #8] \n\t"
00468 "add %[pixels], %[pixels], %[line_size] \n\t"
00469 "pld [%[pixels]] \n\t"
00470 "pld [%[pixels], #32] \n\t"
00471 "walignr1 wr4, wr10, wr11 \n\t"
/* wr10 is reused for the current destination row */
00472 "wldrd wr10, [%[block]] \n\t"
00473 WAVG2B" wr8, wr0, wr4 \n\t"
00474 WAVG2B" wr8, wr8, wr10 \n\t"
00475 "wstrd wr8, [%[block]] \n\t"
00476 "add %[block], %[block], %[line_size] \n\t"
00477
/* second half: next source row -> wr0 (carried into the next iteration) */
00478 "wldrd wr10, [%[pixels]] \n\t"
00479 "wldrd wr11, [%[pixels], #8] \n\t"
00480 "pld [%[block]] \n\t"
00481 "add %[pixels], %[pixels], %[line_size] \n\t"
00482 "pld [%[pixels]] \n\t"
00483 "pld [%[pixels], #32] \n\t"
00484 "walignr1 wr0, wr10, wr11 \n\t"
00485 "wldrd wr10, [%[block]] \n\t"
00486 WAVG2B" wr8, wr0, wr4 \n\t"
00487 WAVG2B" wr8, wr8, wr10 \n\t"
00488 "wstrd wr8, [%[block]] \n\t"
00489 "add %[block], %[block], %[line_size] \n\t"
00490
/* two rows done; loop until h is exactly zero */
00491 "subs %[h], %[h], #2 \n\t"
00492 "pld [%[block]] \n\t"
00493 "bne 1b \n\t"
00494 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00495 :
/* "cc" because subs changes the flags; r12 is the only scratch core register */
00496 : "cc", "memory", "r12");
00497 }
00498
00499 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00500 {
00501 int stride = line_size;
00502
00503
00504 __asm__ __volatile__(
00505 "pld [%[pixels]] \n\t"
00506 "pld [%[pixels], #32] \n\t"
00507 "and r12, %[pixels], #7 \n\t"
00508 "tmcr wcgr1, r12 \n\t"
00509 "bic %[pixels], %[pixels], #7 \n\t"
00510
00511 "wldrd wr10, [%[pixels]] \n\t"
00512 "wldrd wr11, [%[pixels], #8] \n\t"
00513 "wldrd wr12, [%[pixels], #16] \n\t"
00514 "add %[pixels], %[pixels], %[line_size] \n\t"
00515 "pld [%[pixels]] \n\t"
00516 "pld [%[pixels], #32] \n\t"
00517 "walignr1 wr0, wr10, wr11 \n\t"
00518 "walignr1 wr1, wr11, wr12 \n\t"
00519
00520 "1: \n\t"
00521 "wldrd wr10, [%[pixels]] \n\t"
00522 "wldrd wr11, [%[pixels], #8] \n\t"
00523 "wldrd wr12, [%[pixels], #16] \n\t"
00524 "add %[pixels], %[pixels], %[line_size] \n\t"
00525 "pld [%[pixels]] \n\t"
00526 "pld [%[pixels], #32] \n\t"
00527 "walignr1 wr4, wr10, wr11 \n\t"
00528 "walignr1 wr5, wr11, wr12 \n\t"
00529 WAVG2B" wr8, wr0, wr4 \n\t"
00530 WAVG2B" wr9, wr1, wr5 \n\t"
00531 "wstrd wr8, [%[block]] \n\t"
00532 "wstrd wr9, [%[block], #8] \n\t"
00533 "add %[block], %[block], %[line_size] \n\t"
00534
00535 "wldrd wr10, [%[pixels]] \n\t"
00536 "wldrd wr11, [%[pixels], #8] \n\t"
00537 "wldrd wr12, [%[pixels], #16] \n\t"
00538 "add %[pixels], %[pixels], %[line_size] \n\t"
00539 "pld [%[pixels]] \n\t"
00540 "pld [%[pixels], #32] \n\t"
00541 "walignr1 wr0, wr10, wr11 \n\t"
00542 "walignr1 wr1, wr11, wr12 \n\t"
00543 WAVG2B" wr8, wr0, wr4 \n\t"
00544 WAVG2B" wr9, wr1, wr5 \n\t"
00545 "wstrd wr8, [%[block]] \n\t"
00546 "wstrd wr9, [%[block], #8] \n\t"
00547 "add %[block], %[block], %[line_size] \n\t"
00548
00549 "subs %[h], %[h], #2 \n\t"
00550 "bne 1b \n\t"
00551 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00552 :
00553 : "r4", "r5", "r12", "memory");
00554 }
00555
00556 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00557 {
00558 int stride = line_size;
00559
00560
00561 __asm__ __volatile__(
00562 "pld [%[pixels]] \n\t"
00563 "pld [%[pixels], #32] \n\t"
00564 "and r12, %[pixels], #7 \n\t"
00565 "tmcr wcgr1, r12 \n\t"
00566 "bic %[pixels], %[pixels], #7 \n\t"
00567
00568 "wldrd wr10, [%[pixels]] \n\t"
00569 "wldrd wr11, [%[pixels], #8] \n\t"
00570 "pld [%[block]] \n\t"
00571 "wldrd wr12, [%[pixels], #16] \n\t"
00572 "add %[pixels], %[pixels], %[line_size] \n\t"
00573 "pld [%[pixels]] \n\t"
00574 "pld [%[pixels], #32] \n\t"
00575 "walignr1 wr0, wr10, wr11 \n\t"
00576 "walignr1 wr1, wr11, wr12 \n\t"
00577
00578 "1: \n\t"
00579 "wldrd wr10, [%[pixels]] \n\t"
00580 "wldrd wr11, [%[pixels], #8] \n\t"
00581 "wldrd wr12, [%[pixels], #16] \n\t"
00582 "add %[pixels], %[pixels], %[line_size] \n\t"
00583 "pld [%[pixels]] \n\t"
00584 "pld [%[pixels], #32] \n\t"
00585 "walignr1 wr4, wr10, wr11 \n\t"
00586 "walignr1 wr5, wr11, wr12 \n\t"
00587 "wldrd wr10, [%[block]] \n\t"
00588 "wldrd wr11, [%[block], #8] \n\t"
00589 WAVG2B" wr8, wr0, wr4 \n\t"
00590 WAVG2B" wr9, wr1, wr5 \n\t"
00591 WAVG2B" wr8, wr8, wr10 \n\t"
00592 WAVG2B" wr9, wr9, wr11 \n\t"
00593 "wstrd wr8, [%[block]] \n\t"
00594 "wstrd wr9, [%[block], #8] \n\t"
00595 "add %[block], %[block], %[line_size] \n\t"
00596
00597 "wldrd wr10, [%[pixels]] \n\t"
00598 "wldrd wr11, [%[pixels], #8] \n\t"
00599 "pld [%[block]] \n\t"
00600 "wldrd wr12, [%[pixels], #16] \n\t"
00601 "add %[pixels], %[pixels], %[line_size] \n\t"
00602 "pld [%[pixels]] \n\t"
00603 "pld [%[pixels], #32] \n\t"
00604 "walignr1 wr0, wr10, wr11 \n\t"
00605 "walignr1 wr1, wr11, wr12 \n\t"
00606 "wldrd wr10, [%[block]] \n\t"
00607 "wldrd wr11, [%[block], #8] \n\t"
00608 WAVG2B" wr8, wr0, wr4 \n\t"
00609 WAVG2B" wr9, wr1, wr5 \n\t"
00610 WAVG2B" wr8, wr8, wr10 \n\t"
00611 WAVG2B" wr9, wr9, wr11 \n\t"
00612 "wstrd wr8, [%[block]] \n\t"
00613 "wstrd wr9, [%[block], #8] \n\t"
00614 "add %[block], %[block], %[line_size] \n\t"
00615
00616 "subs %[h], %[h], #2 \n\t"
00617 "pld [%[block]] \n\t"
00618 "bne 1b \n\t"
00619 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00620 :
00621 : "r4", "r5", "r12", "memory");
00622 }
00623
00624 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00625 {
00626
00627
00628 SET_RND(wr15);
00629 __asm__ __volatile__(
00630 "pld [%[pixels]] \n\t"
00631 "mov r12, #2 \n\t"
00632 "pld [%[pixels], #32] \n\t"
00633 "tmcr wcgr0, r12 \n\t"
00634 "and r12, %[pixels], #7 \n\t"
00635 "bic %[pixels], %[pixels], #7 \n\t"
00636 "tmcr wcgr1, r12 \n\t"
00637
00638
00639
00640 "wldrd wr12, [%[pixels]] \n\t"
00641 "add r12, r12, #1 \n\t"
00642 "wldrd wr13, [%[pixels], #8] \n\t"
00643 "tmcr wcgr2, r12 \n\t"
00644 "add %[pixels], %[pixels], %[line_size] \n\t"
00645 "cmp r12, #8 \n\t"
00646 "pld [%[pixels]] \n\t"
00647 "pld [%[pixels], #32] \n\t"
00648 "walignr1 wr2, wr12, wr13 \n\t"
00649 "wmoveq wr10, wr13 \n\t"
00650 "walignr2ne wr10, wr12, wr13 \n\t"
00651 "wunpckelub wr0, wr2 \n\t"
00652 "wunpckehub wr1, wr2 \n\t"
00653 "wunpckelub wr8, wr10 \n\t"
00654 "wunpckehub wr9, wr10 \n\t"
00655 "waddhus wr0, wr0, wr8 \n\t"
00656 "waddhus wr1, wr1, wr9 \n\t"
00657
00658 "1: \n\t"
00659
00660
00661 "wldrd wr12, [%[pixels]] \n\t"
00662 "cmp r12, #8 \n\t"
00663 "wldrd wr13, [%[pixels], #8] \n\t"
00664 "add %[pixels], %[pixels], %[line_size] \n\t"
00665 "walignr1 wr6, wr12, wr13 \n\t"
00666 "pld [%[pixels]] \n\t"
00667 "pld [%[pixels], #32] \n\t"
00668 "wmoveq wr10, wr13 \n\t"
00669 "walignr2ne wr10, wr12, wr13 \n\t"
00670 "wunpckelub wr4, wr6 \n\t"
00671 "wunpckehub wr5, wr6 \n\t"
00672 "wunpckelub wr8, wr10 \n\t"
00673 "wunpckehub wr9, wr10 \n\t"
00674 "waddhus wr4, wr4, wr8 \n\t"
00675 "waddhus wr5, wr5, wr9 \n\t"
00676 "waddhus wr8, wr0, wr4 \n\t"
00677 "waddhus wr9, wr1, wr5 \n\t"
00678 "waddhus wr8, wr8, wr15 \n\t"
00679 "waddhus wr9, wr9, wr15 \n\t"
00680 "wsrlhg wr8, wr8, wcgr0 \n\t"
00681 "wsrlhg wr9, wr9, wcgr0 \n\t"
00682 "wpackhus wr8, wr8, wr9 \n\t"
00683 "wstrd wr8, [%[block]] \n\t"
00684 "add %[block], %[block], %[line_size] \n\t"
00685
00686
00687
00688 "wldrd wr12, [%[pixels]] \n\t"
00689 "wldrd wr13, [%[pixels], #8] \n\t"
00690 "add %[pixels], %[pixels], %[line_size] \n\t"
00691 "walignr1 wr2, wr12, wr13 \n\t"
00692 "pld [%[pixels]] \n\t"
00693 "pld [%[pixels], #32] \n\t"
00694 "wmoveq wr10, wr13 \n\t"
00695 "walignr2ne wr10, wr12, wr13 \n\t"
00696 "wunpckelub wr0, wr2 \n\t"
00697 "wunpckehub wr1, wr2 \n\t"
00698 "wunpckelub wr8, wr10 \n\t"
00699 "wunpckehub wr9, wr10 \n\t"
00700 "waddhus wr0, wr0, wr8 \n\t"
00701 "waddhus wr1, wr1, wr9 \n\t"
00702 "waddhus wr8, wr0, wr4 \n\t"
00703 "waddhus wr9, wr1, wr5 \n\t"
00704 "waddhus wr8, wr8, wr15 \n\t"
00705 "waddhus wr9, wr9, wr15 \n\t"
00706 "wsrlhg wr8, wr8, wcgr0 \n\t"
00707 "wsrlhg wr9, wr9, wcgr0 \n\t"
00708 "wpackhus wr8, wr8, wr9 \n\t"
00709 "subs %[h], %[h], #2 \n\t"
00710 "wstrd wr8, [%[block]] \n\t"
00711 "add %[block], %[block], %[line_size] \n\t"
00712 "bne 1b \n\t"
00713 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00714 : [line_size]"r"(line_size)
00715 : "r12", "memory");
00716 }
00717
00718 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00719 {
00720
00721
00722 SET_RND(wr15);
00723 __asm__ __volatile__(
00724 "pld [%[pixels]] \n\t"
00725 "mov r12, #2 \n\t"
00726 "pld [%[pixels], #32] \n\t"
00727 "tmcr wcgr0, r12 \n\t"
00728
00729 "and r12, %[pixels], #7 \n\t"
00730 "bic %[pixels], %[pixels], #7 \n\t"
00731 "tmcr wcgr1, r12 \n\t"
00732 "add r12, r12, #1 \n\t"
00733 "tmcr wcgr2, r12 \n\t"
00734
00735
00736
00737 "wldrd wr12, [%[pixels]] \n\t"
00738 "cmp r12, #8 \n\t"
00739 "wldrd wr13, [%[pixels], #8] \n\t"
00740 "wldrd wr14, [%[pixels], #16] \n\t"
00741 "add %[pixels], %[pixels], %[line_size] \n\t"
00742 "pld [%[pixels]] \n\t"
00743 "walignr1 wr2, wr12, wr13 \n\t"
00744 "pld [%[pixels], #32] \n\t"
00745 "walignr1 wr3, wr13, wr14 \n\t"
00746 "wmoveq wr10, wr13 \n\t"
00747 "wmoveq wr11, wr14 \n\t"
00748 "walignr2ne wr10, wr12, wr13 \n\t"
00749 "walignr2ne wr11, wr13, wr14 \n\t"
00750 "wunpckelub wr0, wr2 \n\t"
00751 "wunpckehub wr1, wr2 \n\t"
00752 "wunpckelub wr2, wr3 \n\t"
00753 "wunpckehub wr3, wr3 \n\t"
00754 "wunpckelub wr8, wr10 \n\t"
00755 "wunpckehub wr9, wr10 \n\t"
00756 "wunpckelub wr10, wr11 \n\t"
00757 "wunpckehub wr11, wr11 \n\t"
00758 "waddhus wr0, wr0, wr8 \n\t"
00759 "waddhus wr1, wr1, wr9 \n\t"
00760 "waddhus wr2, wr2, wr10 \n\t"
00761 "waddhus wr3, wr3, wr11 \n\t"
00762
00763 "1: \n\t"
00764
00765
00766 "wldrd wr12, [%[pixels]] \n\t"
00767 "cmp r12, #8 \n\t"
00768 "wldrd wr13, [%[pixels], #8] \n\t"
00769 "wldrd wr14, [%[pixels], #16] \n\t"
00770 "add %[pixels], %[pixels], %[line_size] \n\t"
00771 "walignr1 wr6, wr12, wr13 \n\t"
00772 "pld [%[pixels]] \n\t"
00773 "pld [%[pixels], #32] \n\t"
00774 "walignr1 wr7, wr13, wr14 \n\t"
00775 "wmoveq wr10, wr13 \n\t"
00776 "wmoveq wr11, wr14 \n\t"
00777 "walignr2ne wr10, wr12, wr13 \n\t"
00778 "walignr2ne wr11, wr13, wr14 \n\t"
00779 "wunpckelub wr4, wr6 \n\t"
00780 "wunpckehub wr5, wr6 \n\t"
00781 "wunpckelub wr6, wr7 \n\t"
00782 "wunpckehub wr7, wr7 \n\t"
00783 "wunpckelub wr8, wr10 \n\t"
00784 "wunpckehub wr9, wr10 \n\t"
00785 "wunpckelub wr10, wr11 \n\t"
00786 "wunpckehub wr11, wr11 \n\t"
00787 "waddhus wr4, wr4, wr8 \n\t"
00788 "waddhus wr5, wr5, wr9 \n\t"
00789 "waddhus wr6, wr6, wr10 \n\t"
00790 "waddhus wr7, wr7, wr11 \n\t"
00791 "waddhus wr8, wr0, wr4 \n\t"
00792 "waddhus wr9, wr1, wr5 \n\t"
00793 "waddhus wr10, wr2, wr6 \n\t"
00794 "waddhus wr11, wr3, wr7 \n\t"
00795 "waddhus wr8, wr8, wr15 \n\t"
00796 "waddhus wr9, wr9, wr15 \n\t"
00797 "waddhus wr10, wr10, wr15 \n\t"
00798 "waddhus wr11, wr11, wr15 \n\t"
00799 "wsrlhg wr8, wr8, wcgr0 \n\t"
00800 "wsrlhg wr9, wr9, wcgr0 \n\t"
00801 "wsrlhg wr10, wr10, wcgr0 \n\t"
00802 "wsrlhg wr11, wr11, wcgr0 \n\t"
00803 "wpackhus wr8, wr8, wr9 \n\t"
00804 "wpackhus wr9, wr10, wr11 \n\t"
00805 "wstrd wr8, [%[block]] \n\t"
00806 "wstrd wr9, [%[block], #8] \n\t"
00807 "add %[block], %[block], %[line_size] \n\t"
00808
00809
00810
00811 "wldrd wr12, [%[pixels]] \n\t"
00812 "wldrd wr13, [%[pixels], #8] \n\t"
00813 "wldrd wr14, [%[pixels], #16] \n\t"
00814 "add %[pixels], %[pixels], %[line_size] \n\t"
00815 "walignr1 wr2, wr12, wr13 \n\t"
00816 "pld [%[pixels]] \n\t"
00817 "pld [%[pixels], #32] \n\t"
00818 "walignr1 wr3, wr13, wr14 \n\t"
00819 "wmoveq wr10, wr13 \n\t"
00820 "wmoveq wr11, wr14 \n\t"
00821 "walignr2ne wr10, wr12, wr13 \n\t"
00822 "walignr2ne wr11, wr13, wr14 \n\t"
00823 "wunpckelub wr0, wr2 \n\t"
00824 "wunpckehub wr1, wr2 \n\t"
00825 "wunpckelub wr2, wr3 \n\t"
00826 "wunpckehub wr3, wr3 \n\t"
00827 "wunpckelub wr8, wr10 \n\t"
00828 "wunpckehub wr9, wr10 \n\t"
00829 "wunpckelub wr10, wr11 \n\t"
00830 "wunpckehub wr11, wr11 \n\t"
00831 "waddhus wr0, wr0, wr8 \n\t"
00832 "waddhus wr1, wr1, wr9 \n\t"
00833 "waddhus wr2, wr2, wr10 \n\t"
00834 "waddhus wr3, wr3, wr11 \n\t"
00835 "waddhus wr8, wr0, wr4 \n\t"
00836 "waddhus wr9, wr1, wr5 \n\t"
00837 "waddhus wr10, wr2, wr6 \n\t"
00838 "waddhus wr11, wr3, wr7 \n\t"
00839 "waddhus wr8, wr8, wr15 \n\t"
00840 "waddhus wr9, wr9, wr15 \n\t"
00841 "waddhus wr10, wr10, wr15 \n\t"
00842 "waddhus wr11, wr11, wr15 \n\t"
00843 "wsrlhg wr8, wr8, wcgr0 \n\t"
00844 "wsrlhg wr9, wr9, wcgr0 \n\t"
00845 "wsrlhg wr10, wr10, wcgr0 \n\t"
00846 "wsrlhg wr11, wr11, wcgr0 \n\t"
00847 "wpackhus wr8, wr8, wr9 \n\t"
00848 "wpackhus wr9, wr10, wr11 \n\t"
00849 "wstrd wr8, [%[block]] \n\t"
00850 "wstrd wr9, [%[block], #8] \n\t"
00851 "add %[block], %[block], %[line_size] \n\t"
00852
00853 "subs %[h], %[h], #2 \n\t"
00854 "bne 1b \n\t"
00855 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00856 : [line_size]"r"(line_size)
00857 : "r12", "memory");
00858 }
00859
00860 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00861 {
00862
00863
00864 SET_RND(wr15);
00865 __asm__ __volatile__(
00866 "pld [%[block]] \n\t"
00867 "pld [%[block], #32] \n\t"
00868 "pld [%[pixels]] \n\t"
00869 "mov r12, #2 \n\t"
00870 "pld [%[pixels], #32] \n\t"
00871 "tmcr wcgr0, r12 \n\t"
00872 "and r12, %[pixels], #7 \n\t"
00873 "bic %[pixels], %[pixels], #7 \n\t"
00874 "tmcr wcgr1, r12 \n\t"
00875
00876
00877
00878 "wldrd wr12, [%[pixels]] \n\t"
00879 "add r12, r12, #1 \n\t"
00880 "wldrd wr13, [%[pixels], #8] \n\t"
00881 "tmcr wcgr2, r12 \n\t"
00882 "add %[pixels], %[pixels], %[line_size] \n\t"
00883 "cmp r12, #8 \n\t"
00884 "pld [%[pixels]] \n\t"
00885 "pld [%[pixels], #32] \n\t"
00886 "walignr1 wr2, wr12, wr13 \n\t"
00887 "wmoveq wr10, wr13 \n\t"
00888 "walignr2ne wr10, wr12, wr13 \n\t"
00889 "wunpckelub wr0, wr2 \n\t"
00890 "wunpckehub wr1, wr2 \n\t"
00891 "wunpckelub wr8, wr10 \n\t"
00892 "wunpckehub wr9, wr10 \n\t"
00893 "waddhus wr0, wr0, wr8 \n\t"
00894 "waddhus wr1, wr1, wr9 \n\t"
00895
00896 "1: \n\t"
00897
00898
00899 "wldrd wr12, [%[pixels]] \n\t"
00900 "cmp r12, #8 \n\t"
00901 "wldrd wr13, [%[pixels], #8] \n\t"
00902 "add %[pixels], %[pixels], %[line_size] \n\t"
00903 "walignr1 wr6, wr12, wr13 \n\t"
00904 "pld [%[pixels]] \n\t"
00905 "pld [%[pixels], #32] \n\t"
00906 "wmoveq wr10, wr13 \n\t"
00907 "walignr2ne wr10, wr12, wr13 \n\t"
00908 "wunpckelub wr4, wr6 \n\t"
00909 "wunpckehub wr5, wr6 \n\t"
00910 "wunpckelub wr8, wr10 \n\t"
00911 "wunpckehub wr9, wr10 \n\t"
00912 "waddhus wr4, wr4, wr8 \n\t"
00913 "waddhus wr5, wr5, wr9 \n\t"
00914 "waddhus wr8, wr0, wr4 \n\t"
00915 "waddhus wr9, wr1, wr5 \n\t"
00916 "waddhus wr8, wr8, wr15 \n\t"
00917 "waddhus wr9, wr9, wr15 \n\t"
00918 "wldrd wr12, [%[block]] \n\t"
00919 "wsrlhg wr8, wr8, wcgr0 \n\t"
00920 "wsrlhg wr9, wr9, wcgr0 \n\t"
00921 "wpackhus wr8, wr8, wr9 \n\t"
00922 WAVG2B" wr8, wr8, wr12 \n\t"
00923 "wstrd wr8, [%[block]] \n\t"
00924 "add %[block], %[block], %[line_size] \n\t"
00925 "wldrd wr12, [%[pixels]] \n\t"
00926 "pld [%[block]] \n\t"
00927 "pld [%[block], #32] \n\t"
00928
00929
00930
00931 "wldrd wr13, [%[pixels], #8] \n\t"
00932 "add %[pixels], %[pixels], %[line_size] \n\t"
00933 "walignr1 wr2, wr12, wr13 \n\t"
00934 "pld [%[pixels]] \n\t"
00935 "pld [%[pixels], #32] \n\t"
00936 "wmoveq wr10, wr13 \n\t"
00937 "walignr2ne wr10, wr12, wr13 \n\t"
00938 "wunpckelub wr0, wr2 \n\t"
00939 "wunpckehub wr1, wr2 \n\t"
00940 "wunpckelub wr8, wr10 \n\t"
00941 "wunpckehub wr9, wr10 \n\t"
00942 "waddhus wr0, wr0, wr8 \n\t"
00943 "waddhus wr1, wr1, wr9 \n\t"
00944 "waddhus wr8, wr0, wr4 \n\t"
00945 "waddhus wr9, wr1, wr5 \n\t"
00946 "waddhus wr8, wr8, wr15 \n\t"
00947 "waddhus wr9, wr9, wr15 \n\t"
00948 "wldrd wr12, [%[block]] \n\t"
00949 "wsrlhg wr8, wr8, wcgr0 \n\t"
00950 "wsrlhg wr9, wr9, wcgr0 \n\t"
00951 "wpackhus wr8, wr8, wr9 \n\t"
00952 "subs %[h], %[h], #2 \n\t"
00953 WAVG2B" wr8, wr8, wr12 \n\t"
00954 "wstrd wr8, [%[block]] \n\t"
00955 "add %[block], %[block], %[line_size] \n\t"
00956 "pld [%[block]] \n\t"
00957 "pld [%[block], #32] \n\t"
00958 "bne 1b \n\t"
00959 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00960 : [line_size]"r"(line_size)
00961 : "r12", "memory");
00962 }
00963
00964 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00965 {
00966
00967
00968 SET_RND(wr15);
00969 __asm__ __volatile__(
00970 "pld [%[block]] \n\t"
00971 "pld [%[block], #32] \n\t"
00972 "pld [%[pixels]] \n\t"
00973 "mov r12, #2 \n\t"
00974 "pld [%[pixels], #32] \n\t"
00975 "tmcr wcgr0, r12 \n\t"
00976
00977 "and r12, %[pixels], #7 \n\t"
00978 "bic %[pixels], %[pixels], #7 \n\t"
00979 "tmcr wcgr1, r12 \n\t"
00980 "add r12, r12, #1 \n\t"
00981 "tmcr wcgr2, r12 \n\t"
00982
00983
00984
00985 "wldrd wr12, [%[pixels]] \n\t"
00986 "cmp r12, #8 \n\t"
00987 "wldrd wr13, [%[pixels], #8] \n\t"
00988 "wldrd wr14, [%[pixels], #16] \n\t"
00989 "add %[pixels], %[pixels], %[line_size] \n\t"
00990 "pld [%[pixels]] \n\t"
00991 "walignr1 wr2, wr12, wr13 \n\t"
00992 "pld [%[pixels], #32] \n\t"
00993 "walignr1 wr3, wr13, wr14 \n\t"
00994 "wmoveq wr10, wr13 \n\t"
00995 "wmoveq wr11, wr14 \n\t"
00996 "walignr2ne wr10, wr12, wr13 \n\t"
00997 "walignr2ne wr11, wr13, wr14 \n\t"
00998 "wunpckelub wr0, wr2 \n\t"
00999 "wunpckehub wr1, wr2 \n\t"
01000 "wunpckelub wr2, wr3 \n\t"
01001 "wunpckehub wr3, wr3 \n\t"
01002 "wunpckelub wr8, wr10 \n\t"
01003 "wunpckehub wr9, wr10 \n\t"
01004 "wunpckelub wr10, wr11 \n\t"
01005 "wunpckehub wr11, wr11 \n\t"
01006 "waddhus wr0, wr0, wr8 \n\t"
01007 "waddhus wr1, wr1, wr9 \n\t"
01008 "waddhus wr2, wr2, wr10 \n\t"
01009 "waddhus wr3, wr3, wr11 \n\t"
01010
01011 "1: \n\t"
01012
01013
01014 "wldrd wr12, [%[pixels]] \n\t"
01015 "cmp r12, #8 \n\t"
01016 "wldrd wr13, [%[pixels], #8] \n\t"
01017 "wldrd wr14, [%[pixels], #16] \n\t"
01018 "add %[pixels], %[pixels], %[line_size] \n\t"
01019 "walignr1 wr6, wr12, wr13 \n\t"
01020 "pld [%[pixels]] \n\t"
01021 "pld [%[pixels], #32] \n\t"
01022 "walignr1 wr7, wr13, wr14 \n\t"
01023 "wmoveq wr10, wr13 \n\t"
01024 "wmoveq wr11, wr14 \n\t"
01025 "walignr2ne wr10, wr12, wr13 \n\t"
01026 "walignr2ne wr11, wr13, wr14 \n\t"
01027 "wunpckelub wr4, wr6 \n\t"
01028 "wunpckehub wr5, wr6 \n\t"
01029 "wunpckelub wr6, wr7 \n\t"
01030 "wunpckehub wr7, wr7 \n\t"
01031 "wunpckelub wr8, wr10 \n\t"
01032 "wunpckehub wr9, wr10 \n\t"
01033 "wunpckelub wr10, wr11 \n\t"
01034 "wunpckehub wr11, wr11 \n\t"
01035 "waddhus wr4, wr4, wr8 \n\t"
01036 "waddhus wr5, wr5, wr9 \n\t"
01037 "waddhus wr6, wr6, wr10 \n\t"
01038 "waddhus wr7, wr7, wr11 \n\t"
01039 "waddhus wr8, wr0, wr4 \n\t"
01040 "waddhus wr9, wr1, wr5 \n\t"
01041 "waddhus wr10, wr2, wr6 \n\t"
01042 "waddhus wr11, wr3, wr7 \n\t"
01043 "waddhus wr8, wr8, wr15 \n\t"
01044 "waddhus wr9, wr9, wr15 \n\t"
01045 "waddhus wr10, wr10, wr15 \n\t"
01046 "waddhus wr11, wr11, wr15 \n\t"
01047 "wsrlhg wr8, wr8, wcgr0 \n\t"
01048 "wsrlhg wr9, wr9, wcgr0 \n\t"
01049 "wldrd wr12, [%[block]] \n\t"
01050 "wldrd wr13, [%[block], #8] \n\t"
01051 "wsrlhg wr10, wr10, wcgr0 \n\t"
01052 "wsrlhg wr11, wr11, wcgr0 \n\t"
01053 "wpackhus wr8, wr8, wr9 \n\t"
01054 "wpackhus wr9, wr10, wr11 \n\t"
01055 WAVG2B" wr8, wr8, wr12 \n\t"
01056 WAVG2B" wr9, wr9, wr13 \n\t"
01057 "wstrd wr8, [%[block]] \n\t"
01058 "wstrd wr9, [%[block], #8] \n\t"
01059 "add %[block], %[block], %[line_size] \n\t"
01060
01061
01062
01063 "wldrd wr12, [%[pixels]] \n\t"
01064 "pld [%[block]] \n\t"
01065 "wldrd wr13, [%[pixels], #8] \n\t"
01066 "pld [%[block], #32] \n\t"
01067 "wldrd wr14, [%[pixels], #16] \n\t"
01068 "add %[pixels], %[pixels], %[line_size] \n\t"
01069 "walignr1 wr2, wr12, wr13 \n\t"
01070 "pld [%[pixels]] \n\t"
01071 "pld [%[pixels], #32] \n\t"
01072 "walignr1 wr3, wr13, wr14 \n\t"
01073 "wmoveq wr10, wr13 \n\t"
01074 "wmoveq wr11, wr14 \n\t"
01075 "walignr2ne wr10, wr12, wr13 \n\t"
01076 "walignr2ne wr11, wr13, wr14 \n\t"
01077 "wunpckelub wr0, wr2 \n\t"
01078 "wunpckehub wr1, wr2 \n\t"
01079 "wunpckelub wr2, wr3 \n\t"
01080 "wunpckehub wr3, wr3 \n\t"
01081 "wunpckelub wr8, wr10 \n\t"
01082 "wunpckehub wr9, wr10 \n\t"
01083 "wunpckelub wr10, wr11 \n\t"
01084 "wunpckehub wr11, wr11 \n\t"
01085 "waddhus wr0, wr0, wr8 \n\t"
01086 "waddhus wr1, wr1, wr9 \n\t"
01087 "waddhus wr2, wr2, wr10 \n\t"
01088 "waddhus wr3, wr3, wr11 \n\t"
01089 "waddhus wr8, wr0, wr4 \n\t"
01090 "waddhus wr9, wr1, wr5 \n\t"
01091 "waddhus wr10, wr2, wr6 \n\t"
01092 "waddhus wr11, wr3, wr7 \n\t"
01093 "waddhus wr8, wr8, wr15 \n\t"
01094 "waddhus wr9, wr9, wr15 \n\t"
01095 "waddhus wr10, wr10, wr15 \n\t"
01096 "waddhus wr11, wr11, wr15 \n\t"
01097 "wsrlhg wr8, wr8, wcgr0 \n\t"
01098 "wsrlhg wr9, wr9, wcgr0 \n\t"
01099 "wldrd wr12, [%[block]] \n\t"
01100 "wldrd wr13, [%[block], #8] \n\t"
01101 "wsrlhg wr10, wr10, wcgr0 \n\t"
01102 "wsrlhg wr11, wr11, wcgr0 \n\t"
01103 "wpackhus wr8, wr8, wr9 \n\t"
01104 "wpackhus wr9, wr10, wr11 \n\t"
01105 WAVG2B" wr8, wr8, wr12 \n\t"
01106 WAVG2B" wr9, wr9, wr13 \n\t"
01107 "wstrd wr8, [%[block]] \n\t"
01108 "wstrd wr9, [%[block], #8] \n\t"
01109 "add %[block], %[block], %[line_size] \n\t"
01110 "subs %[h], %[h], #2 \n\t"
01111 "pld [%[block]] \n\t"
01112 "pld [%[block], #32] \n\t"
01113 "bne 1b \n\t"
01114 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
01115 : [line_size]"r"(line_size)
01116 : "r12", "memory");
01117 }
01118
01119 #endif