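/* MMX/3DNow!/SSE2-optimized DSP utilities for x86: pixel put/avg routines,
 * clamped block stores, encoder comparison metrics (SAD/SSE/Hadamard/NSSE),
 * plus MPEG-4 qpel and H.263 loop-filter kernels. */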
#include "dsputil.h"
#include "dsputil_mmx.h"
#include "simple_idct.h"
#include "mpegvideo.h"
#include "x86_cpu.h"
#include "mmx.h"
#include "vp3dsp_mmx.h"
#include "vp3dsp_sse2.h"
#include "h263.h"

extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags;

DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
    {0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };

#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

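/* Materialize small packed constants in a register without a memory load:
 * pcmpeqd sets every bit; psrlw $15 then leaves 0x0001 in each 16-bit word
 * (MOVQ_WONE), while paddb doubles the 0xFF bytes to 0xFE (MOVQ_BFE). */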
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

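/* With PIC, referencing ff_bone/ff_wtwo through an "m" constraint would need
 * a GOT-relative load, so the constants are synthesized in registers instead:
 * words of 1 packed down to bytes of 1 (BONE), or shifted left once to give
 * words of 2 (WTWO). */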
#ifndef PIC
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif

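/* Byte-wise averaging without widening to 16 bits.  Scalar equivalent per
 * byte (a reference sketch, not used by the code itself):
 *     no-rnd: (a & b) + (((a ^ b) & 0xFE) >> 1)   ==  (a + b)     >> 1
 *     rnd:    (a | b) - (((a ^ b) & 0xFE) >> 1)   ==  (a + b + 1) >> 1
 * The 0xFE mask in regfe keeps psrlq from shifting bits across byte
 * boundaries. */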
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "      \n\t"\
    "pand " #regb ", " #regr "      \n\t"\
    "pxor " #rega ", " #regb "      \n\t"\
    "pand " #regfe "," #regb "      \n\t"\
    "psrlq $1, " #regb "            \n\t"\
    "paddb " #regb ", " #regr "     \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "      \n\t"\
    "por  " #regb ", " #regr "      \n\t"\
    "pxor " #rega ", " #regb "      \n\t"\
    "pand " #regfe "," #regb "      \n\t"\
    "psrlq $1, " #regb "            \n\t"\
    "psubb " #regb ", " #regr "     \n\t"

#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "      \n\t"\
    "movq " #regc ", " #regp "      \n\t"\
    "pand " #regb ", " #regr "      \n\t"\
    "pand " #regd ", " #regp "      \n\t"\
    "pxor " #rega ", " #regb "      \n\t"\
    "pxor " #regc ", " #regd "      \n\t"\
    "pand %%mm6, " #regb "          \n\t"\
    "pand %%mm6, " #regd "          \n\t"\
    "psrlq $1, " #regb "            \n\t"\
    "psrlq $1, " #regd "            \n\t"\
    "paddb " #regb ", " #regr "     \n\t"\
    "paddb " #regd ", " #regp "     \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "      \n\t"\
    "movq " #regc ", " #regp "      \n\t"\
    "por  " #regb ", " #regr "      \n\t"\
    "por  " #regd ", " #regp "      \n\t"\
    "pxor " #rega ", " #regb "      \n\t"\
    "pxor " #regc ", " #regd "      \n\t"\
    "pand %%mm6, " #regb "          \n\t"\
    "pand %%mm6, " #regd "          \n\t"\
    "psrlq $1, " #regd "            \n\t"\
    "psrlq $1, " #regb "            \n\t"\
    "psubb " #regb ", " #regr "     \n\t"\
    "psubb " #regd ", " #regp "     \n\t"

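/* Instantiate the put/avg pixel templates twice via dsputil_mmx_rnd.h: once
 * with truncating (no_rnd) averaging and once with rounding averaging, then
 * once more via dsputil_mmx_avg.h using the hardware byte-average opcode
 * (pavgusb on 3DNow!, pavgb on MMX2). */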
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define DEF(x) x ## _mmx2

#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx

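/* Encoder-only helpers: unpack an 8x8 block of pixels (or the difference of
 * two blocks) into 16-bit DCT coefficients, zero-extending bytes to words
 * with punpck against a zeroed mm7. */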
#ifdef CONFIG_ENCODERS
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
#endif //CONFIG_ENCODERS

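/* Store a 16-bit 8x8 block to 8-bit pixels; packuswb performs the required
 * clamping to [0,255] for free.  Done in two passes of four rows each since
 * one pass consumes all eight MMX registers. */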
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    p = block;
    pix = pixels;

    __asm __volatile(
        "movq   %3, %%mm0               \n\t"
        "movq   8%3, %%mm1              \n\t"
        "movq   16%3, %%mm2             \n\t"
        "movq   24%3, %%mm3             \n\t"
        "movq   32%3, %%mm4             \n\t"
        "movq   40%3, %%mm5             \n\t"
        "movq   48%3, %%mm6             \n\t"
        "movq   56%3, %%mm7             \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq   %%mm0, (%0)             \n\t"
        "movq   %%mm2, (%0, %1)         \n\t"
        "movq   %%mm4, (%0, %1, 2)      \n\t"
        "movq   %%mm6, (%0, %2)         \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    __asm __volatile(
        "movq   (%3), %%mm0             \n\t"
        "movq   8(%3), %%mm1            \n\t"
        "movq   16(%3), %%mm2           \n\t"
        "movq   24(%3), %%mm3           \n\t"
        "movq   32(%3), %%mm4           \n\t"
        "movq   40(%3), %%mm5           \n\t"
        "movq   48(%3), %%mm6           \n\t"
        "movq   56(%3), %%mm7           \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq   %%mm0, (%0)             \n\t"
        "movq   %%mm2, (%0, %1)         \n\t"
        "movq   %%mm4, (%0, %1, 2)      \n\t"
        "movq   %%mm6, (%0, %2)         \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
        :"memory");
}

static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}

void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq   (%2), %%mm0         \n\t"
            "movq   8(%2), %%mm1        \n\t"
            "movq   16(%2), %%mm2       \n\t"
            "movq   24(%2), %%mm3       \n\t"
            "movq   %0, %%mm4           \n\t"
            "movq   %1, %%mm6           \n\t"
            "movq   %%mm4, %%mm5        \n\t"
            "punpcklbw %%mm7, %%mm4     \n\t"
            "punpckhbw %%mm7, %%mm5     \n\t"
            "paddsw %%mm4, %%mm0        \n\t"
            "paddsw %%mm5, %%mm1        \n\t"
            "movq   %%mm6, %%mm5        \n\t"
            "punpcklbw %%mm7, %%mm6     \n\t"
            "punpckhbw %%mm7, %%mm5     \n\t"
            "paddsw %%mm6, %%mm2        \n\t"
            "paddsw %%mm5, %%mm3        \n\t"
            "packuswb %%mm1, %%mm0      \n\t"
            "packuswb %%mm3, %%mm2      \n\t"
            "movq   %%mm0, %0           \n\t"
            "movq   %%mm2, %1           \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%1, %3), %%mm1           \n\t"
        "movd %%mm0, (%2)               \n\t"
        "movd %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movd (%1), %%mm0               \n\t"
        "movd (%1, %3), %%mm1           \n\t"
        "movd %%mm0, (%2)               \n\t"
        "movd %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a"        \n\t"
        ASMALIGN(3)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 8(%1), %%mm4              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq 8(%1), %%mm4              \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq %%mm0, (%2)               \n\t"
        "movq %%mm4, 8(%2)              \n\t"
        "movq %%mm1, (%2, %3)           \n\t"
        "movq %%mm5, 8(%2, %3)          \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "subl $4, %0                    \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1,%3), %%xmm1         \n\t"
        "movdqu (%1,%3,2), %%xmm2       \n\t"
        "movdqu (%1,%4), %%xmm3         \n\t"
        "movdqa %%xmm0, (%2)            \n\t"
        "movdqa %%xmm1, (%2,%3)         \n\t"
        "movdqa %%xmm2, (%2,%3,2)       \n\t"
        "movdqa %%xmm3, (%2,%4)         \n\t"
        "subl $4, %0                    \n\t"
        "lea (%1,%3,4), %1              \n\t"
        "lea (%2,%3,4), %2              \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size), "r"(3L*line_size)
        : "memory"
        );
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1,%3), %%xmm1         \n\t"
        "movdqu (%1,%3,2), %%xmm2       \n\t"
        "movdqu (%1,%4), %%xmm3         \n\t"
        "pavgb  (%2), %%xmm0            \n\t"
        "pavgb  (%2,%3), %%xmm1         \n\t"
        "pavgb  (%2,%3,2), %%xmm2       \n\t"
        "pavgb  (%2,%4), %%xmm3         \n\t"
        "movdqa %%xmm0, (%2)            \n\t"
        "movdqa %%xmm1, (%2,%3)         \n\t"
        "movdqa %%xmm2, (%2,%3,2)       \n\t"
        "movdqa %%xmm3, (%2,%4)         \n\t"
        "subl $4, %0                    \n\t"
        "lea (%1,%3,4), %1              \n\t"
        "lea (%2,%3,4), %2              \n\t"
        "jnz 1b                         \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size), "r"(3L*line_size)
        : "memory"
        );
}

static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128*6, %%"REG_a"         \n\t"
        "1:                             \n\t"
        "movq %%mm7, (%0, %%"REG_a")    \n\t"
        "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
        "add $32, %%"REG_a"             \n\t"
        " js 1b                         \n\t"
        : : "r" (((uint8_t *)blocks)+128*6)
        : "%"REG_a
    );
}

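/* Sum of all pixels in a 16x16 block (encoder only).  16-bit accumulators
 * are safe here: the maximum possible sum is 16*16*255 = 65280, which still
 * fits in one word after the final horizontal reduction. */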
#ifdef CONFIG_ENCODERS
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
        );

    return sum;
}
#endif //CONFIG_ENCODERS

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t"
        "movq  (%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %3, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

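/* H.263 in-loop deblocking across one 8-pixel edge.  The four lines p0..p3
 * are passed in %0..%3; a correction term derived from (p0-p3) and 4*(p2-p1),
 * clipped against the strength in %4, is added to p1 and subtracted from p2,
 * with a smaller correction (masked by ff_pb_FC in %5) applied to p0/p3.
 * On exit the filtered lines sit in mm3..mm6 for the caller to store. */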
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7              \n\t"\
    "movq  %0, %%mm0                \n\t"\
    "movq  %0, %%mm1                \n\t"\
    "movq  %3, %%mm2                \n\t"\
    "movq  %3, %%mm3                \n\t"\
    "punpcklbw %%mm7, %%mm0         \n\t"\
    "punpckhbw %%mm7, %%mm1         \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "psubw %%mm2, %%mm0             \n\t"\
    "psubw %%mm3, %%mm1             \n\t"\
    "movq  %1, %%mm2                \n\t"\
    "movq  %1, %%mm3                \n\t"\
    "movq  %2, %%mm4                \n\t"\
    "movq  %2, %%mm5                \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm7, %%mm5         \n\t"\
    "psubw %%mm2, %%mm4             \n\t"\
    "psubw %%mm3, %%mm5             \n\t"\
    "psllw $2, %%mm4                \n\t"\
    "psllw $2, %%mm5                \n\t"\
    "paddw %%mm0, %%mm4             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "pxor %%mm6, %%mm6              \n\t"\
    "pcmpgtw %%mm4, %%mm6           \n\t"\
    "pcmpgtw %%mm5, %%mm7           \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "pxor %%mm7, %%mm5              \n\t"\
    "psubw %%mm6, %%mm4             \n\t"\
    "psubw %%mm7, %%mm5             \n\t"\
    "psrlw $3, %%mm4                \n\t"\
    "psrlw $3, %%mm5                \n\t"\
    "packuswb %%mm5, %%mm4          \n\t"\
    "packsswb %%mm7, %%mm6          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"\
    "movd %4, %%mm2                 \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "psubusb %%mm4, %%mm2           \n\t"\
    "movq %%mm2, %%mm3              \n\t"\
    "psubusb %%mm4, %%mm3           \n\t"\
    "psubb %%mm3, %%mm2             \n\t"\
    "movq %1, %%mm3                 \n\t"\
    "movq %2, %%mm4                 \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm3           \n\t"\
    "psubusb %%mm2, %%mm4           \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm2           \n\t"\
    "packsswb %%mm1, %%mm0          \n\t"\
    "pcmpgtb %%mm0, %%mm7           \n\t"\
    "pxor %%mm7, %%mm0              \n\t"\
    "psubb %%mm7, %%mm0             \n\t"\
    "movq %%mm0, %%mm1              \n\t"\
    "psubusb %%mm2, %%mm0           \n\t"\
    "psubb %%mm0, %%mm1             \n\t"\
    "pand %5, %%mm1                 \n\t"\
    "psrlw $2, %%mm1                \n\t"\
    "pxor %%mm7, %%mm1              \n\t"\
    "psubb %%mm7, %%mm1             \n\t"\
    "movq %0, %%mm5                 \n\t"\
    "movq %3, %%mm6                 \n\t"\
    "psubb %%mm1, %%mm5             \n\t"\
    "paddb %%mm1, %%mm6             \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}

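/* Transpose a 4x4 block of bytes via punpckl{bw,wd}/punpckhdq so that the
 * horizontal loop filter can reuse the vertical-filter code on columns. */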
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile(
        "movd %4, %%mm0                 \n\t"
        "movd %5, %%mm1                 \n\t"
        "movd %6, %%mm2                 \n\t"
        "movd %7, %%mm3                 \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd %%mm0, %0                 \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd %%mm0, %1                 \n\t"
        "movd %%mm1, %2                 \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, %3                 \n\t"

        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        :  "m" (*(uint32_t*)(src + 0*src_stride)),
           "m" (*(uint32_t*)(src + 1*src_stride)),
           "m" (*(uint32_t*)(src + 2*src_stride)),
           "m" (*(uint32_t*)(src + 3*src_stride))
    );
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp[4]);
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    asm volatile(
        "movq %%mm5, %%mm1              \n\t"
        "movq %%mm4, %%mm0              \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq %%mm5, %%mm3              \n\t"
        "movq %%mm1, %%mm6              \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd %%mm5, (%0)               \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd %%mm5, (%0,%2)            \n\t"
        "movd %%mm3, (%0,%2,2)          \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd %%mm3, (%0,%3)            \n\t"
        "movd %%mm1, (%1)               \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%1,%2)            \n\t"
        "movd %%mm6, (%1,%2,2)          \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd %%mm6, (%1,%3)            \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long)   stride ),
           "r" ((long)(3*stride))
    );
    }
}

#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"
        "movq 8(%0),%%mm3\n"

        "movq %%mm2,%%mm1\n"

        "punpckhbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm2\n"

        "movq %%mm3,%%mm4\n"
        "punpckhbw %%mm0,%%mm3\n"
        "punpcklbw %%mm0,%%mm4\n"

        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm2,%%mm2\n"

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"

        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm1\n"
        "movq (%1),%%mm2\n"
        "movq (%0,%3),%%mm3\n"
        "movq (%1,%3),%%mm4\n"

        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm3\n"

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "lea (%0,%3,2), %0\n"
        "lea (%1,%3,2), %1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm1\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm3\n"
        "movq 8(%1),%%mm4\n"

        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm3\n"

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    asm volatile (
        "shr $1,%2\n"
        "pxor %%xmm0,%%xmm0\n"
        "pxor %%xmm7,%%xmm7\n"
        "1:\n"
        "movdqu (%0),%%xmm1\n"
        "movdqu (%1),%%xmm2\n"
        "movdqu (%0,%4),%%xmm3\n"
        "movdqu (%1,%4),%%xmm4\n"

        "movdqa %%xmm1,%%xmm5\n"
        "movdqa %%xmm3,%%xmm6\n"
        "psubusb %%xmm2,%%xmm1\n"
        "psubusb %%xmm4,%%xmm3\n"
        "psubusb %%xmm5,%%xmm2\n"
        "psubusb %%xmm6,%%xmm4\n"

        "por %%xmm1,%%xmm2\n"
        "por %%xmm3,%%xmm4\n"

        "movdqa %%xmm2,%%xmm1\n"
        "movdqa %%xmm4,%%xmm3\n"

        "punpckhbw %%xmm0,%%xmm2\n"
        "punpckhbw %%xmm0,%%xmm4\n"
        "punpcklbw %%xmm0,%%xmm1\n"
        "punpcklbw %%xmm0,%%xmm3\n"

        "pmaddwd %%xmm2,%%xmm2\n"
        "pmaddwd %%xmm4,%%xmm4\n"
        "pmaddwd %%xmm1,%%xmm1\n"
        "pmaddwd %%xmm3,%%xmm3\n"

        "lea (%0,%4,2), %0\n"
        "lea (%1,%4,2), %1\n"

        "paddd %%xmm2,%%xmm1\n"
        "paddd %%xmm4,%%xmm3\n"
        "paddd %%xmm1,%%xmm7\n"
        "paddd %%xmm3,%%xmm7\n"

        "decl %2\n"
        "jnz 1b\n"

        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $8, %%xmm7\n"
        "paddd %%xmm1,%%xmm7\n"
        "movdqa %%xmm7,%%xmm1\n"
        "psrldq $4, %%xmm7\n"
        "paddd %%xmm1,%%xmm7\n"
        "movd %%xmm7,%3\n"
        : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
        : "r" ((long)line_size));
    return tmp;
}

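/* Sum of absolute second-order differences (horizontal first difference,
 * then its vertical difference) over an 8-pixel-wide column: an estimate of
 * high-frequency "noise" used by the NSSE comparison functions below.  The
 * first two rows are handled before the loop to prime mm0..mm5. */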
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

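/* Noise-preserving SSE: plain SSE plus a penalty proportional to how much
 * the two blocks differ in high-frequency energy, weighted by
 * avctx->nsse_weight (8 when no context is available). */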
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

    asm volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "subl $2, %%ecx\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

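/* HuffYUV median prediction: dst[i] = src2[i] - median(left, top,
 * left + top - topleft), with the median computed branchlessly via
 * pminub/pmaxub.  The first column and the returned left/left_top
 * values are handled in C after the loop. */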
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "movq  -1(%2, %0), %%mm2        \n\t"
        "movq  (%2, %0), %%mm3          \n\t"
        "movq %%mm2, %%mm4              \n\t"
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "pminub %%mm5, %%mm1            \n\t"
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t"
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"            \n\t"\
    "mov"#m" "#p2", "#t"            \n\t"\
    "punpcklbw "#a", "#t"           \n\t"\
    "punpcklbw "#a", "#a"           \n\t"\
    "psubw     "#t", "#a"           \n\t"

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                     \n\t"\
        "add %4, %2                     \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0            \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0            \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

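/* Butterfly on two register pairs without needing a spare temporary:
 *     a += b;  b += b;  b -= a;
 * leaves a = a_old + b_old and b = b_old - a_old.  Six of these give a
 * full 8-point Hadamard transform. */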
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "         \n\t"\
    "paddw " #b2 ", " #a2 "         \n\t"\
    "paddw " #b1 ", " #b1 "         \n\t"\
    "paddw " #b2 ", " #b2 "         \n\t"\
    "psubw " #a1 ", " #b1 "         \n\t"\
    "psubw " #a2 ", " #b2 "         \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

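/* Packed 16-bit absolute value.  Plain MMX needs a sign mask (pcmpgtw) plus
 * xor/sub; MMX2 computes pmaxsw(x, -x); SSSE3 has a dedicated pabsw. */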
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "pcmpgtw " #a ", " #z "         \n\t"\
    "pxor " #z ", " #a "            \n\t"\
    "psubw " #z ", " #a "           \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "            \n\t"\
    "psubw " #a ", " #z "           \n\t"\
    "pmaxsw " #z ", " #a "          \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "           \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "       \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0         \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)            \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2            \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0         \n\t"
#endif

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"         \n\t"\
    "movq "#o"+8(%1),  "#b"         \n\t"\
    "movq "#o"+16(%1), "#c"         \n\t"\
    "movq "#o"+24(%1), "#d"         \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)            \n\t"\
    "movq "#b", "#o"+8(%1)          \n\t"\
    "movq "#c", "#o"+16(%1)         \n\t"\
    "movq "#d", "#o"+24(%1)         \n\t"

#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                \n\t"\
    "psrlq $32, "#a"                \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movq "#a", "#t"                \n\t"\
    "psrlq $16, "#a"                \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"       \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshufw $0x01, "#a", "#t"       \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"             \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshuflw $0x0E, "#a", "#t"      \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "pshuflw $0x01, "#a", "#t"      \n\t"\
    "paddusw "#t", "#a"             \n\t"\
    "movd "#a", "#dst"              \n\t"

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2    \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3    \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4    \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5    \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0              \n\t"\
    "pxor %%mm1, %%mm1              \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0           \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0            \n\t"\
    "pxor %%xmm1, %%xmm1            \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0         \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

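/* Sum of squared differences between an int8 block and an int16 block.
 * The int8 samples are sign-extended to words by duplicating each byte
 * with punpck and then arithmetic-shifting right by 8; the uninitialized
 * low bytes in mm3 are shifted out by the psraw. */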
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4              \n"
        "1:                             \n"
        "sub $8, %0                     \n"
        "movq (%2,%0), %%mm2            \n"
        "movq (%3,%0,2), %%mm0          \n"
        "movq 8(%3,%0,2), %%mm1         \n"
        "punpckhbw %%mm2, %%mm3         \n"
        "punpcklbw %%mm2, %%mm2         \n"
        "psraw $8, %%mm3                \n"
        "psraw $8, %%mm2                \n"
        "psubw %%mm3, %%mm1             \n"
        "psubw %%mm2, %%mm0             \n"
        "pmaddwd %%mm1, %%mm1           \n"
        "pmaddwd %%mm0, %%mm0           \n"
        "paddd %%mm1, %%mm4             \n"
        "paddd %%mm0, %%mm4             \n"
        "jg 1b                          \n"
        "movq %%mm4, %%mm3              \n"
        "psrlq $32, %%mm3               \n"
        "paddd %%mm3, %%mm4             \n"
        "movd %%mm4, %1                 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

#endif //CONFIG_ENCODERS

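/* MPEG-4 quarter-pel interpolation: an 8-tap (-1, 3, -6, 20, 20, -6, 3, -1)
 * lowpass, visible in the scalar fallback below as 20*(b+c) - 6*(a+d)
 * + 3*(...) - (...), followed by the rounder and a >>5.  QPEL_V_LOW emits
 * one output row of the vertical pass; QPEL_BASE instantiates the
 * horizontal pass for the put/avg and rounding variants. */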
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "             \n\t" \
    "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" \
    "pmullw " #m3 ", %%mm4              \n\t" \
    "movq "#in7", " #m3 "               \n\t" \
    "movq "#in0", %%mm5                 \n\t" \
    "paddw " #m3 ", %%mm5               \n\t" \
    "psubw %%mm5, %%mm4                 \n\t" \
    "movq "#in1", %%mm5                 \n\t" \
    "movq "#in2", %%mm6                 \n\t" \
    "paddw " #m6 ", %%mm5               \n\t" \
    "paddw " #m5 ", %%mm6               \n\t" \
    "paddw %%mm6, %%mm6                 \n\t" \
    "psubw %%mm6, %%mm5                 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" \
    "paddw " #rnd ", %%mm4              \n\t" \
    "paddw %%mm4, %%mm5                 \n\t" \
    "psraw $5, %%mm5                    \n\t"\
    "packuswb %%mm5, %%mm5              \n\t"\
    OP(%%mm5, out, %%mm7, d)

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    asm volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "1:                             \n\t"\
        "movq  (%0), %%mm0              \n\t" \
        "movq %%mm0, %%mm1              \n\t" \
        "movq %%mm0, %%mm2              \n\t" \
        "punpcklbw %%mm7, %%mm0         \n\t" \
        "punpckhbw %%mm7, %%mm1         \n\t" \
        "pshufw $0x90, %%mm0, %%mm5     \n\t" \
        "pshufw $0x41, %%mm0, %%mm6     \n\t" \
        "movq %%mm2, %%mm3              \n\t" \
        "movq %%mm2, %%mm4              \n\t" \
        "psllq $8, %%mm2                \n\t" \
        "psllq $16, %%mm3               \n\t" \
        "psllq $24, %%mm4               \n\t" \
        "punpckhbw %%mm7, %%mm2         \n\t" \
        "punpckhbw %%mm7, %%mm3         \n\t" \
        "punpckhbw %%mm7, %%mm4         \n\t" \
        "paddw %%mm3, %%mm5             \n\t" \
        "paddw %%mm2, %%mm6             \n\t" \
        "paddw %%mm5, %%mm5             \n\t" \
        "psubw %%mm5, %%mm6             \n\t" \
        "pshufw $0x06, %%mm0, %%mm5     \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
        "paddw %%mm4, %%mm0             \n\t" \
        "paddw %%mm1, %%mm5             \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
        "psubw %%mm5, %%mm0             \n\t" \
        "paddw %6, %%mm6                \n\t"\
        "paddw %%mm6, %%mm0             \n\t" \
        "psraw $5, %%mm0                \n\t"\
        "movq %%mm0, %5                 \n\t"\
        \
        "movq 5(%0), %%mm0              \n\t" \
        "movq %%mm0, %%mm5              \n\t" \
        "movq %%mm0, %%mm6              \n\t" \
        "psrlq $8, %%mm0                \n\t" \
        "psrlq $16, %%mm5               \n\t" \
        "punpcklbw %%mm7, %%mm0         \n\t" \
        "punpcklbw %%mm7, %%mm5         \n\t" \
        "paddw %%mm0, %%mm2             \n\t" \
        "paddw %%mm5, %%mm3             \n\t" \
        "paddw %%mm2, %%mm2             \n\t" \
        "psubw %%mm2, %%mm3             \n\t" \
        "movq %%mm6, %%mm2              \n\t" \
        "psrlq $24, %%mm6               \n\t" \
        "punpcklbw %%mm7, %%mm2         \n\t" \
        "punpcklbw %%mm7, %%mm6         \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
        "paddw %%mm2, %%mm1             \n\t" \
        "paddw %%mm6, %%mm4             \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
        "psubw %%mm4, %%mm3             \n\t" \
        "paddw %6, %%mm1                \n\t"\
        "paddw %%mm1, %%mm3             \n\t" \
        "psraw $5, %%mm3                \n\t"\
        "movq %5, %%mm1                 \n\t"\
        "packuswb %%mm3, %%mm1          \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        \
        "movq 9(%0), %%mm1              \n\t" \
        "movq %%mm1, %%mm4              \n\t" \
        "movq %%mm1, %%mm3              \n\t" \
        "psrlq $8, %%mm1                \n\t" \
        "psrlq $16, %%mm4               \n\t" \
        "punpcklbw %%mm7, %%mm1         \n\t" \
        "punpcklbw %%mm7, %%mm4         \n\t" \
        "paddw %%mm1, %%mm5             \n\t" \
        "paddw %%mm4, %%mm0             \n\t" \
        "paddw %%mm5, %%mm5             \n\t" \
        "psubw %%mm5, %%mm0             \n\t" \
        "movq %%mm3, %%mm5              \n\t" \
        "psrlq $24, %%mm3               \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
        "punpcklbw %%mm7, %%mm3         \n\t" \
        "paddw %%mm3, %%mm2             \n\t" \
        "psubw %%mm2, %%mm0             \n\t" \
        "movq %%mm5, %%mm2              \n\t" \
        "punpcklbw %%mm7, %%mm2         \n\t" \
        "punpckhbw %%mm7, %%mm5         \n\t" \
        "paddw %%mm2, %%mm6             \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
        "paddw %6, %%mm0                \n\t"\
        "paddw %%mm6, %%mm0             \n\t" \
        "psraw $5, %%mm0                \n\t"\
        \
        "paddw %%mm5, %%mm3             \n\t" \
        "pshufw $0xF9, %%mm5, %%mm6     \n\t" \
        "paddw %%mm4, %%mm6             \n\t" \
        "pshufw $0xBE, %%mm5, %%mm4     \n\t" \
        "pshufw $0x6F, %%mm5, %%mm5     \n\t" \
        "paddw %%mm1, %%mm4             \n\t" \
        "paddw %%mm2, %%mm5             \n\t" \
        "paddw %%mm6, %%mm6             \n\t" \
        "psubw %%mm6, %%mm4             \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
        "psubw %%mm5, %%mm3             \n\t" \
        "paddw %6, %%mm4                \n\t"\
        "paddw %%mm3, %%mm4             \n\t" \
        "psraw $5, %%mm4                \n\t"\
        "packuswb %%mm4, %%mm0          \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        \
        "add %3, %0                     \n\t"\
        "add %4, %1                     \n\t"\
        "decl %2                        \n\t"\
        " jnz 1b                        \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((long)srcStride), "S"((long)dstStride), "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
02082 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
02083 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
02084 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
02085 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
02086 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
02087 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
02088 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
02089 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
02090 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
02091 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
02092 asm volatile(\
02093 "movq (%0), %%mm0 \n\t"\
02094 "movq 8(%0), %%mm1 \n\t"\
02095 "paddw %2, %%mm0 \n\t"\
02096 "paddw %2, %%mm1 \n\t"\
02097 "psraw $5, %%mm0 \n\t"\
02098 "psraw $5, %%mm1 \n\t"\
02099 "packuswb %%mm1, %%mm0 \n\t"\
02100 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
02101 "movq 16(%0), %%mm0 \n\t"\
02102 "movq 24(%0), %%mm1 \n\t"\
02103 "paddw %2, %%mm0 \n\t"\
02104 "paddw %2, %%mm1 \n\t"\
02105 "psraw $5, %%mm0 \n\t"\
02106 "psraw $5, %%mm1 \n\t"\
02107 "packuswb %%mm1, %%mm0 \n\t"\
02108 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
02109 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
02110 : "memory"\
02111 );\
02112 dst+=dstStride;\
02113 src+=srcStride;\
02114 }\
02115 }\
02116 \
02117 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
02118 uint64_t temp;\
02119 \
02120 asm volatile(\
02121 "pxor %%mm7, %%mm7 \n\t"\
02122 "1: \n\t"\
02123 "movq (%0), %%mm0 \n\t" \
02124 "movq %%mm0, %%mm1 \n\t" \
02125 "movq %%mm0, %%mm2 \n\t" \
02126 "punpcklbw %%mm7, %%mm0 \n\t" \
02127 "punpckhbw %%mm7, %%mm1 \n\t" \
02128 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
02129 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
02130 "movq %%mm2, %%mm3 \n\t" \
02131 "movq %%mm2, %%mm4 \n\t" \
02132 "psllq $8, %%mm2 \n\t" \
02133 "psllq $16, %%mm3 \n\t" \
02134 "psllq $24, %%mm4 \n\t" \
02135 "punpckhbw %%mm7, %%mm2 \n\t" \
02136 "punpckhbw %%mm7, %%mm3 \n\t" \
02137 "punpckhbw %%mm7, %%mm4 \n\t" \
02138 "paddw %%mm3, %%mm5 \n\t" \
02139 "paddw %%mm2, %%mm6 \n\t" \
02140 "paddw %%mm5, %%mm5 \n\t" \
02141 "psubw %%mm5, %%mm6 \n\t" \
02142 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
02143 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
02144 "paddw %%mm4, %%mm0 \n\t" \
02145 "paddw %%mm1, %%mm5 \n\t" \
02146 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
02147 "psubw %%mm5, %%mm0 \n\t" \
02148 "paddw %6, %%mm6 \n\t"\
02149 "paddw %%mm6, %%mm0 \n\t" \
02150 "psraw $5, %%mm0 \n\t"\
02151 \
02152 \
02153 "movd 5(%0), %%mm5 \n\t" \
02154 "punpcklbw %%mm7, %%mm5 \n\t" \
02155 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
02156 "paddw %%mm5, %%mm1 \n\t" \
02157 "paddw %%mm6, %%mm2 \n\t" \
02158 "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
02159 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
02160 "paddw %%mm6, %%mm3 \n\t" \
02161 "paddw %%mm5, %%mm4 \n\t" \
02162 "paddw %%mm2, %%mm2 \n\t" \
02163 "psubw %%mm2, %%mm3 \n\t" \
02164 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
02165 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
02166 "psubw %%mm4, %%mm3 \n\t" \
02167 "paddw %6, %%mm1 \n\t"\
02168 "paddw %%mm1, %%mm3 \n\t" \
02169 "psraw $5, %%mm3 \n\t"\
02170 "packuswb %%mm3, %%mm0 \n\t"\
02171 OP_MMX2(%%mm0, (%1), %%mm4, q)\
02172 \
02173 "add %3, %0 \n\t"\
02174 "add %4, %1 \n\t"\
02175 "decl %2 \n\t"\
02176 " jnz 1b \n\t"\
02177 : "+a"(src), "+c"(dst), "+m"(h)\
02178 : "S"((long)srcStride), "D"((long)dstStride), "m"(temp), "m"(ROUNDER)\
02179 : "memory"\
02180 );\
02181 }\
02182 \
02183 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
02184 int i;\
02185 int16_t temp[8];\
02186 \
02187 for(i=0; i<h; i++)\
02188 {\
02189 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
02190 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
02191 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
02192 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
02193 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
02194 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
02195 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
02196 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
02197 asm volatile(\
02198 "movq (%0), %%mm0 \n\t"\
02199 "movq 8(%0), %%mm1 \n\t"\
02200 "paddw %2, %%mm0 \n\t"\
02201 "paddw %2, %%mm1 \n\t"\
02202 "psraw $5, %%mm0 \n\t"\
02203 "psraw $5, %%mm1 \n\t"\
02204 "packuswb %%mm1, %%mm0 \n\t"\
02205 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
02206 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
02207 :"memory"\
02208 );\
02209 dst+=dstStride;\
02210 src+=srcStride;\
02211 }\
02212 }
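
/* QPEL_BASE instantiates the horizontal MPEG-4 qpel lowpass filters for 8-
   and 16-pixel-wide rows.  The mmx2 versions evaluate the taps entirely in
   MMX registers using pshufw shuffles; the 3dnow versions compute the taps
   per row in scalar C (the temp[] expressions, which also handle the edge
   replication at the row ends) and use MMX only for the final rounding,
   packing and store. */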
02213
02214 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
02215 \
02216 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02217 uint64_t temp[17*4];\
02218 uint64_t *temp_ptr= temp;\
02219 int count= 17;\
02220 \
02221 \
02222 asm volatile(\
02223 "pxor %%mm7, %%mm7 \n\t"\
02224 "1: \n\t"\
02225 "movq (%0), %%mm0 \n\t"\
02226 "movq (%0), %%mm1 \n\t"\
02227 "movq 8(%0), %%mm2 \n\t"\
02228 "movq 8(%0), %%mm3 \n\t"\
02229 "punpcklbw %%mm7, %%mm0 \n\t"\
02230 "punpckhbw %%mm7, %%mm1 \n\t"\
02231 "punpcklbw %%mm7, %%mm2 \n\t"\
02232 "punpckhbw %%mm7, %%mm3 \n\t"\
02233 "movq %%mm0, (%1) \n\t"\
02234 "movq %%mm1, 17*8(%1) \n\t"\
02235 "movq %%mm2, 2*17*8(%1) \n\t"\
02236 "movq %%mm3, 3*17*8(%1) \n\t"\
02237 "add $8, %1 \n\t"\
02238 "add %3, %0 \n\t"\
02239 "decl %2 \n\t"\
02240 " jnz 1b \n\t"\
02241 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
02242 : "r" ((long)srcStride)\
02243 : "memory"\
02244 );\
02245 \
02246 temp_ptr= temp;\
02247 count=4;\
02248 \
02249 \
02250 asm volatile(\
02251 \
02252 "1: \n\t"\
02253 "movq (%0), %%mm0 \n\t"\
02254 "movq 8(%0), %%mm1 \n\t"\
02255 "movq 16(%0), %%mm2 \n\t"\
02256 "movq 24(%0), %%mm3 \n\t"\
02257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
02258 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
02259 "add %4, %1 \n\t"\
02260 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
02261 \
02262 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
02263 "add %4, %1 \n\t"\
02264 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
02265 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
02266 "add %4, %1 \n\t"\
02267 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
02268 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
02269 "add %4, %1 \n\t"\
02270 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
02271 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
02272 "add %4, %1 \n\t"\
02273 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
02274 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
02275 "add %4, %1 \n\t"\
02276 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
02277 \
02278 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
02279 "add %4, %1 \n\t" \
02280 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
02281 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
02282 \
02283 "add $136, %0 \n\t"\
02284 "add %6, %1 \n\t"\
02285 "decl %2 \n\t"\
02286 " jnz 1b \n\t"\
02287 \
02288 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
02289 : "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ROUNDER), "g"(4-14*(long)dstStride)\
02290 :"memory"\
02291 );\
02292 }\
02293 \
02294 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02295 uint64_t temp[9*2];\
02296 uint64_t *temp_ptr= temp;\
02297 int count= 9;\
02298 \
02299 \
02300 asm volatile(\
02301 "pxor %%mm7, %%mm7 \n\t"\
02302 "1: \n\t"\
02303 "movq (%0), %%mm0 \n\t"\
02304 "movq (%0), %%mm1 \n\t"\
02305 "punpcklbw %%mm7, %%mm0 \n\t"\
02306 "punpckhbw %%mm7, %%mm1 \n\t"\
02307 "movq %%mm0, (%1) \n\t"\
02308 "movq %%mm1, 9*8(%1) \n\t"\
02309 "add $8, %1 \n\t"\
02310 "add %3, %0 \n\t"\
02311 "decl %2 \n\t"\
02312 " jnz 1b \n\t"\
02313 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
02314 : "r" ((long)srcStride)\
02315 : "memory"\
02316 );\
02317 \
02318 temp_ptr= temp;\
02319 count=2;\
02320 \
02321 \
02322 asm volatile(\
02323 \
02324 "1: \n\t"\
02325 "movq (%0), %%mm0 \n\t"\
02326 "movq 8(%0), %%mm1 \n\t"\
02327 "movq 16(%0), %%mm2 \n\t"\
02328 "movq 24(%0), %%mm3 \n\t"\
02329 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
02330 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
02331 "add %4, %1 \n\t"\
02332 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
02333 \
02334 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
02335 "add %4, %1 \n\t"\
02336 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
02337 \
02338 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
02339 "add %4, %1 \n\t"\
02340 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
02341 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
02342 \
02343 "add $72, %0 \n\t"\
02344 "add %6, %1 \n\t"\
02345 "decl %2 \n\t"\
02346 " jnz 1b \n\t"\
02347 \
02348 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
02349 : "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ROUNDER), "g"(4-6*(long)dstStride)\
02350 : "memory"\
02351 );\
02352 }\
02353 \
02354 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
02355 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
02356 }\
02357 \
02358 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02359 uint64_t temp[8];\
02360 uint8_t * const half= (uint8_t*)temp;\
02361 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
02362 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
02363 }\
02364 \
02365 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02366 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
02367 }\
02368 \
02369 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02370 uint64_t temp[8];\
02371 uint8_t * const half= (uint8_t*)temp;\
02372 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
02373 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
02374 }\
02375 \
02376 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02377 uint64_t temp[8];\
02378 uint8_t * const half= (uint8_t*)temp;\
02379 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
02380 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
02381 }\
02382 \
02383 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02384 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
02385 }\
02386 \
02387 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02388 uint64_t temp[8];\
02389 uint8_t * const half= (uint8_t*)temp;\
02390 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
02391 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
02392 }\
02393 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02394 uint64_t half[8 + 9];\
02395 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02396 uint8_t * const halfHV= ((uint8_t*)half);\
02397 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02398 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02399 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02400 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02401 }\
02402 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02403 uint64_t half[8 + 9];\
02404 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02405 uint8_t * const halfHV= ((uint8_t*)half);\
02406 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02407 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02408 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02409 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02410 }\
02411 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02412 uint64_t half[8 + 9];\
02413 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02414 uint8_t * const halfHV= ((uint8_t*)half);\
02415 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02416 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02417 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02418 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02419 }\
02420 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02421 uint64_t half[8 + 9];\
02422 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02423 uint8_t * const halfHV= ((uint8_t*)half);\
02424 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02425 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02426 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02427 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02428 }\
02429 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02430 uint64_t half[8 + 9];\
02431 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02432 uint8_t * const halfHV= ((uint8_t*)half);\
02433 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02434 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02435 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02436 }\
02437 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02438 uint64_t half[8 + 9];\
02439 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02440 uint8_t * const halfHV= ((uint8_t*)half);\
02441 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02442 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02443 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02444 }\
02445 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02446 uint64_t half[8 + 9];\
02447 uint8_t * const halfH= ((uint8_t*)half);\
02448 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02449 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02450 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02451 }\
02452 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02453 uint64_t half[8 + 9];\
02454 uint8_t * const halfH= ((uint8_t*)half);\
02455 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02456 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02457 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02458 }\
02459 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02460 uint64_t half[9];\
02461 uint8_t * const halfH= ((uint8_t*)half);\
02462 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02463 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02464 }\
02465 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
02466 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
02467 }\
02468 \
02469 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02470 uint64_t temp[32];\
02471 uint8_t * const half= (uint8_t*)temp;\
02472 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
02473 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
02474 }\
02475 \
02476 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02477 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
02478 }\
02479 \
02480 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02481 uint64_t temp[32];\
02482 uint8_t * const half= (uint8_t*)temp;\
02483 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
02484 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
02485 }\
02486 \
02487 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02488 uint64_t temp[32];\
02489 uint8_t * const half= (uint8_t*)temp;\
02490 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
02491 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
02492 }\
02493 \
02494 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02495 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
02496 }\
02497 \
02498 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02499 uint64_t temp[32];\
02500 uint8_t * const half= (uint8_t*)temp;\
02501 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
02502 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
02503 }\
02504 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02505 uint64_t half[16*2 + 17*2];\
02506 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02507 uint8_t * const halfHV= ((uint8_t*)half);\
02508 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02509 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02510 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02511 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02512 }\
02513 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02514 uint64_t half[16*2 + 17*2];\
02515 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02516 uint8_t * const halfHV= ((uint8_t*)half);\
02517 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02518 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02519 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02520 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02521 }\
02522 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02523 uint64_t half[16*2 + 17*2];\
02524 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02525 uint8_t * const halfHV= ((uint8_t*)half);\
02526 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02527 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02528 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02529 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02530 }\
02531 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02532 uint64_t half[16*2 + 17*2];\
02533 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02534 uint8_t * const halfHV= ((uint8_t*)half);\
02535 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02536 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02537 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02538 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02539 }\
02540 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02541 uint64_t half[16*2 + 17*2];\
02542 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02543 uint8_t * const halfHV= ((uint8_t*)half);\
02544 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02545 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02546 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02547 }\
02548 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02549 uint64_t half[16*2 + 17*2];\
02550 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02551 uint8_t * const halfHV= ((uint8_t*)half);\
02552 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02553 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02554 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02555 }\
02556 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02557 uint64_t half[17*2];\
02558 uint8_t * const halfH= ((uint8_t*)half);\
02559 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02560 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02561 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02562 }\
02563 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02564 uint64_t half[17*2];\
02565 uint8_t * const halfH= ((uint8_t*)half);\
02566 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02567 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02568 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02569 }\
02570 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02571 uint64_t half[17*2];\
02572 uint8_t * const halfH= ((uint8_t*)half);\
02573 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02574 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02575 }
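
/* QPEL_OP first generates the vertical lowpass filters (a two-pass scheme:
   rows are unpacked into a 16-bit temp buffer, then the 8-tap filter runs
   down the columns), and from these plus the horizontal filters all 16
   quarter-pel functions mc00..mc33, where the two digits are the
   horizontal and vertical quarter-pel phases.  Mixed positions blend the
   intermediate planes with the pixels*_l2 averaging helpers. */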
02576
02577 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
02578 #define AVG_3DNOW_OP(a,b,temp, size) \
02579 "mov" #size " " #b ", " #temp " \n\t"\
02580 "pavgusb " #temp ", " #a " \n\t"\
02581 "mov" #size " " #a ", " #b " \n\t"
02582 #define AVG_MMX2_OP(a,b,temp, size) \
02583 "mov" #size " " #b ", " #temp " \n\t"\
02584 "pavgb " #temp ", " #a " \n\t"\
02585 "mov" #size " " #a ", " #b " \n\t"
02586
02587 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
02588 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
02589 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
02590 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
02591 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
02592 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
02593 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
02594 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
02595 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
02596
02597
02598
02599
02600 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
02601 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02602 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
02603 }
02604 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
02605 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02606 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
02607 }
02608
02609 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
02610 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
02611 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
02612 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
02613 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
02614 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
02615 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
02616 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
02617 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
02618 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
02619 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02620 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
02621 }\
02622 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02623 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
02624 }\
02625 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
02626 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
02627 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
02628 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
02629 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
02630 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
02631 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
02632 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
02633
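/* The 2-tap qpel functions approximate the full 8-tap filter with simple
   linear interpolation built from the half-pel routines (plus the _l3
   three-point averaging helpers for the mixed positions), trading accuracy
   for speed. */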
02634 QPEL_2TAP(put_, 16, mmx2)
02635 QPEL_2TAP(avg_, 16, mmx2)
02636 QPEL_2TAP(put_, 8, mmx2)
02637 QPEL_2TAP(avg_, 8, mmx2)
02638 QPEL_2TAP(put_, 16, 3dnow)
02639 QPEL_2TAP(avg_, 16, 3dnow)
02640 QPEL_2TAP(put_, 8, 3dnow)
02641 QPEL_2TAP(avg_, 8, 3dnow)
02642
02643
02644 #if 0
02645 static void just_return() { return; }
02646 #endif
02647
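/* Global motion compensation with per-pixel bilinear interpolation:
     dst = ( (s-dx)*(s-dy)*src[0]     + dx*(s-dy)*src[1]
           + (s-dx)*dy   *src[stride] + dx*dy   *src[stride+1]
           + r ) >> (2*shift),  with s = 1<<shift.
   If the source coordinates could overflow the 16-bit fixed-point
   intermediates anywhere in the block, or the increments are not
   multiples of 16, it falls back to the C version ff_gmc_c(). */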
02648 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
02649 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
02650 const int w = 8;
02651 const int ix = ox>>(16+shift);
02652 const int iy = oy>>(16+shift);
02653 const int oxs = ox>>4;
02654 const int oys = oy>>4;
02655 const int dxxs = dxx>>4;
02656 const int dxys = dxy>>4;
02657 const int dyxs = dyx>>4;
02658 const int dyys = dyy>>4;
02659 const uint16_t r4[4] = {r,r,r,r};
02660 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
02661 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
02662 const uint64_t shift2 = 2*shift;
02663 uint8_t edge_buf[(h+1)*stride];
02664 int x, y;
02665
02666 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
02667 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
02668 const int dxh = dxy*(h-1);
02669 const int dyw = dyx*(w-1);
02670 if(
02671 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
02672 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
02673
02674 || (dxx|dxy|dyx|dyy)&15 )
02675 {
02676
02677 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
02678 return;
02679 }
02680
02681 src += ix + iy*stride;
02682 if( (unsigned)ix >= width-w ||
02683 (unsigned)iy >= height-h )
02684 {
02685 ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
02686 src = edge_buf;
02687 }
02688
02689 asm volatile(
02690 "movd %0, %%mm6 \n\t"
02691 "pxor %%mm7, %%mm7 \n\t"
02692 "punpcklwd %%mm6, %%mm6 \n\t"
02693 "punpcklwd %%mm6, %%mm6 \n\t"
02694 :: "r"(1<<shift)
02695 );
02696
02697 for(x=0; x<w; x+=4){
02698 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
02699 oxs - dxys + dxxs*(x+1),
02700 oxs - dxys + dxxs*(x+2),
02701 oxs - dxys + dxxs*(x+3) };
02702 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
02703 oys - dyys + dyxs*(x+1),
02704 oys - dyys + dyxs*(x+2),
02705 oys - dyys + dyxs*(x+3) };
02706
02707 for(y=0; y<h; y++){
            asm volatile(
                /* advance the four x/y source coordinates by one row
                   (dxy4/dyy4); the initializers above pre-subtracted one
                   increment, so the values are correct on first use.
                   psrlw $12 keeps the high bits as the bilinear weights. */
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );
02720
            asm volatile(
                /* bilinear weights, with mm4=dx, mm5=dy and mm6 holding s:
                   mm0=(s-dx)*(s-dy), mm1=dx*(s-dy), mm2=(s-dx)*dy, mm3=dx*dy */
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t"
                "pmullw %%mm5, %%mm3 \n\t"
                "pmullw %%mm5, %%mm2 \n\t"
                "pmullw %%mm4, %%mm1 \n\t"

                /* weight the two bottom neighbours */
                "movd %4, %%mm5 \n\t"    /* src[stride+1] */
                "movd %3, %%mm4 \n\t"    /* src[stride]   */
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t"
                "pmullw %%mm4, %%mm2 \n\t"

                /* weight the two top neighbours */
                "movd %2, %%mm5 \n\t"    /* src[1] */
                "movd %1, %%mm4 \n\t"    /* src[0] */
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t"
                "pmullw %%mm4, %%mm0 \n\t"
                /* add the rounder, sum the four terms and scale back down */
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
02760 src += stride;
02761 }
02762 src += 4-h*stride;
02763 }
02764 }
02765
02766 #ifdef CONFIG_ENCODERS
02767
02768 #define PHADDD(a, t)\
02769 "movq "#a", "#t" \n\t"\
02770 "psrlq $32, "#a" \n\t"\
02771 "paddd "#t", "#a" \n\t"
02772
02773
02774
02775
02776
02777 #define PMULHRW(x, y, s, o)\
02778 "pmulhw " #s ", "#x " \n\t"\
02779 "pmulhw " #s ", "#y " \n\t"\
02780 "paddw " #o ", "#x " \n\t"\
02781 "paddw " #o ", "#y " \n\t"\
02782 "psraw $1, "#x " \n\t"\
02783 "psraw $1, "#y " \n\t"
02784 #define DEF(x) x ## _mmx
02785 #define SET_RND MOVQ_WONE
02786 #define SCALE_OFFSET 1
02787
02788 #include "dsputil_mmx_qns.h"
02789
02790 #undef DEF
02791 #undef SET_RND
02792 #undef SCALE_OFFSET
02793 #undef PMULHRW
02794
02795 #define DEF(x) x ## _3dnow
02796 #define SET_RND(x)
02797 #define SCALE_OFFSET 0
02798 #define PMULHRW(x, y, s, o)\
02799 "pmulhrw " #s ", "#x " \n\t"\
02800 "pmulhrw " #s ", "#y " \n\t"
02801
02802 #include "dsputil_mmx_qns.h"
02803
02804 #undef DEF
02805 #undef SET_RND
02806 #undef SCALE_OFFSET
02807 #undef PMULHRW
02808
02809 #ifdef HAVE_SSSE3
02810 #undef PHADDD
02811 #define DEF(x) x ## _ssse3
02812 #define SET_RND(x)
02813 #define SCALE_OFFSET -1
02814 #define PHADDD(a, t)\
02815 "pshufw $0x0E, "#a", "#t" \n\t"\
02816 "paddd "#t", "#a" \n\t"
02817 #define PMULHRW(x, y, s, o)\
02818 "pmulhrsw " #s ", "#x " \n\t"\
02819 "pmulhrsw " #s ", "#y " \n\t"
02820
02821 #include "dsputil_mmx_qns.h"
02822
02823 #undef DEF
02824 #undef SET_RND
02825 #undef SCALE_OFFSET
02826 #undef PMULHRW
02827 #undef PHADDD
02828 #endif //HAVE_SSSE3
02829
02830 #endif
02831
02832 #define PREFETCH(name, op) \
02833 static void name(void *mem, int stride, int h){\
02834 const uint8_t *p= mem;\
02835 do{\
02836 asm volatile(#op" %0" :: "m"(*p));\
02837 p+= stride;\
02838 }while(--h);\
02839 }
02840 PREFETCH(prefetch_mmx2, prefetcht0)
02841 PREFETCH(prefetch_3dnow, prefetch)
02842 #undef PREFETCH
02843
02844 #include "h264dsp_mmx.c"
02845
02846
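/* CAVS-specific wrappers: the full-pel (mc00) cases simply forward to the
   plain MMX copy/average routines. */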
02847 void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
02848
02849 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
02850 put_pixels8_mmx(dst, src, stride, 8);
02851 }
02852 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
02853 avg_pixels8_mmx(dst, src, stride, 8);
02854 }
02855 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
02856 put_pixels16_mmx(dst, src, stride, 16);
02857 }
02858 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
02859 avg_pixels16_mmx(dst, src, stride, 16);
02860 }
02861
02862
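/* FLAC: prototype for the SSE2 autocorrelation routine (implemented
   elsewhere in the i386 code); hooked up for the FLAC encoder below. */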
02863 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
02864 double *autoc);
02865
02866
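/* VC-1 specific wrapper: the full-pel mspel case ignores rnd and forwards
   to the plain MMX copy. */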
02867 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
02868
02869 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
02870 put_pixels8_mmx(dst, src, stride, 8);
02871 }
02872
02873
02874 void ff_mmx_idct(DCTELEM *block);
02875 void ff_mmxext_idct(DCTELEM *block);
02876
02877
02878
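/* iDCT wrappers: each pairs a raw iDCT with the clamped put/add pixel
   routines.  The libmpeg2-derived ff_mmx_idct/ff_mmxext_idct versions are
   GPL-only, hence the CONFIG_GPL guard. */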
02879 #ifdef CONFIG_GPL
02880 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
02881 {
02882 ff_mmx_idct (block);
02883 put_pixels_clamped_mmx(block, dest, line_size);
02884 }
02885 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
02886 {
02887 ff_mmx_idct (block);
02888 add_pixels_clamped_mmx(block, dest, line_size);
02889 }
02890 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
02891 {
02892 ff_mmxext_idct (block);
02893 put_pixels_clamped_mmx(block, dest, line_size);
02894 }
02895 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
02896 {
02897 ff_mmxext_idct (block);
02898 add_pixels_clamped_mmx(block, dest, line_size);
02899 }
02900 #endif
02901 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
02902 {
02903 ff_idct_xvid_mmx (block);
02904 put_pixels_clamped_mmx(block, dest, line_size);
02905 }
02906 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
02907 {
02908 ff_idct_xvid_mmx (block);
02909 add_pixels_clamped_mmx(block, dest, line_size);
02910 }
02911 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
02912 {
02913 ff_idct_xvid_mmx2 (block);
02914 put_pixels_clamped_mmx(block, dest, line_size);
02915 }
02916 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
02917 {
02918 ff_idct_xvid_mmx2 (block);
02919 add_pixels_clamped_mmx(block, dest, line_size);
02920 }
02921
02922 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
02923 {
02924 int i;
02925 asm volatile("pxor %%mm7, %%mm7":);
02926 for(i=0; i<blocksize; i+=2) {
02927 asm volatile(
02928 "movq %0, %%mm0 \n\t"
02929 "movq %1, %%mm1 \n\t"
02930 "movq %%mm0, %%mm2 \n\t"
02931 "movq %%mm1, %%mm3 \n\t"
02932 "pfcmpge %%mm7, %%mm2 \n\t"
02933 "pfcmpge %%mm7, %%mm3 \n\t"
02934 "pslld $31, %%mm2 \n\t"
02935 "pxor %%mm2, %%mm1 \n\t"
02936 "movq %%mm3, %%mm4 \n\t"
02937 "pand %%mm1, %%mm3 \n\t"
02938 "pandn %%mm1, %%mm4 \n\t"
02939 "pfadd %%mm0, %%mm3 \n\t"
02940 "pfsub %%mm4, %%mm0 \n\t"
02941 "movq %%mm3, %1 \n\t"
02942 "movq %%mm0, %0 \n\t"
02943 :"+m"(mag[i]), "+m"(ang[i])
02944 ::"memory"
02945 );
02946 }
02947 asm volatile("femms");
02948 }
02949 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
02950 {
02951 int i;
02952
02953 asm volatile(
02954 "movaps %0, %%xmm5 \n\t"
02955 ::"m"(ff_pdw_80000000[0])
02956 );
02957 for(i=0; i<blocksize; i+=4) {
02958 asm volatile(
02959 "movaps %0, %%xmm0 \n\t"
02960 "movaps %1, %%xmm1 \n\t"
02961 "xorps %%xmm2, %%xmm2 \n\t"
02962 "xorps %%xmm3, %%xmm3 \n\t"
02963 "cmpleps %%xmm0, %%xmm2 \n\t"
02964 "cmpleps %%xmm1, %%xmm3 \n\t"
02965 "andps %%xmm5, %%xmm2 \n\t"
02966 "xorps %%xmm2, %%xmm1 \n\t"
02967 "movaps %%xmm3, %%xmm4 \n\t"
02968 "andps %%xmm1, %%xmm3 \n\t"
02969 "andnps %%xmm1, %%xmm4 \n\t"
02970 "addps %%xmm0, %%xmm3 \n\t"
02971 "subps %%xmm4, %%xmm0 \n\t"
02972 "movaps %%xmm3, %1 \n\t"
02973 "movaps %%xmm0, %0 \n\t"
02974 :"+m"(mag[i]), "+m"(ang[i])
02975 ::"memory"
02976 );
02977 }
02978 }
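
/* Both routines above implement Vorbis inverse channel coupling, the
   3DNow! one two floats per iteration and the SSE one four (so blocksize
   must be a multiple of 2 resp. 4; the SSE version additionally needs
   16-byte-aligned mag/ang for movaps).  A scalar sketch of the transform,
   equivalent to the SIMD code: */
#if 0
static void vorbis_inverse_coupling_c(float *mag, float *ang, int blocksize)
{
    int i;
    for(i=0; i<blocksize; i++) {
        float m = mag[i], a = ang[i];
        float t = (m >= 0) ? -a : a;   /* flip ang when mag is non-negative */
        if(a >= 0)
            ang[i] = m + t;
        else {
            ang[i] = m;
            mag[i] = m - t;
        }
    }
}
#endif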
02979
02980 static void vector_fmul_3dnow(float *dst, const float *src, int len){
02981 long i = (len-4)*4;
02982 asm volatile(
02983 "1: \n\t"
02984 "movq (%1,%0), %%mm0 \n\t"
02985 "movq 8(%1,%0), %%mm1 \n\t"
02986 "pfmul (%2,%0), %%mm0 \n\t"
02987 "pfmul 8(%2,%0), %%mm1 \n\t"
02988 "movq %%mm0, (%1,%0) \n\t"
02989 "movq %%mm1, 8(%1,%0) \n\t"
02990 "sub $16, %0 \n\t"
02991 "jge 1b \n\t"
02992 "femms \n\t"
02993 :"+r"(i)
02994 :"r"(dst), "r"(src)
02995 :"memory"
02996 );
02997 }
02998 static void vector_fmul_sse(float *dst, const float *src, int len){
02999 long i = (len-8)*4;
03000 asm volatile(
03001 "1: \n\t"
03002 "movaps (%1,%0), %%xmm0 \n\t"
03003 "movaps 16(%1,%0), %%xmm1 \n\t"
03004 "mulps (%2,%0), %%xmm0 \n\t"
03005 "mulps 16(%2,%0), %%xmm1 \n\t"
03006 "movaps %%xmm0, (%1,%0) \n\t"
03007 "movaps %%xmm1, 16(%1,%0) \n\t"
03008 "sub $32, %0 \n\t"
03009 "jge 1b \n\t"
03010 :"+r"(i)
03011 :"r"(dst), "r"(src)
03012 :"memory"
03013 );
03014 }
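
/* vector_fmul: dst[i] *= src[i] over len floats, walking from the top of
   the arrays down to index 0.  len must be a multiple of 4 (3DNow!) or
   8 (SSE), and the SSE version needs 16-byte-aligned pointers for movaps.
   A scalar sketch: */
#if 0
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}
#endif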
03015
03016 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
03017 long i = len*4-16;
03018 asm volatile(
03019 "1: \n\t"
03020 "pswapd 8(%1), %%mm0 \n\t"
03021 "pswapd (%1), %%mm1 \n\t"
03022 "pfmul (%3,%0), %%mm0 \n\t"
03023 "pfmul 8(%3,%0), %%mm1 \n\t"
03024 "movq %%mm0, (%2,%0) \n\t"
03025 "movq %%mm1, 8(%2,%0) \n\t"
03026 "add $16, %1 \n\t"
03027 "sub $16, %0 \n\t"
03028 "jge 1b \n\t"
03029 :"+r"(i), "+r"(src1)
03030 :"r"(dst), "r"(src0)
03031 );
03032 asm volatile("femms");
03033 }
03034 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
03035 long i = len*4-32;
03036 asm volatile(
03037 "1: \n\t"
03038 "movaps 16(%1), %%xmm0 \n\t"
03039 "movaps (%1), %%xmm1 \n\t"
03040 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
03041 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
03042 "mulps (%3,%0), %%xmm0 \n\t"
03043 "mulps 16(%3,%0), %%xmm1 \n\t"
03044 "movaps %%xmm0, (%2,%0) \n\t"
03045 "movaps %%xmm1, 16(%2,%0) \n\t"
03046 "add $32, %1 \n\t"
03047 "sub $32, %0 \n\t"
03048 "jge 1b \n\t"
03049 :"+r"(i), "+r"(src1)
03050 :"r"(dst), "r"(src0)
03051 );
03052 }
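
/* vector_fmul_reverse: dst[i] = src0[i] * src1[len-1-i].  src1 is walked
   forwards while dst/src0 are walked backwards; pswapd (extended 3DNow!)
   and shufps $0x1b (SSE) reverse the element order within each load. */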
03053
03054 static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
03055 const float *src2, int src3, int len, int step){
03056 long i = (len-4)*4;
03057 if(step == 2 && src3 == 0){
03058 dst += (len-4)*2;
03059 asm volatile(
03060 "1: \n\t"
03061 "movq (%2,%0), %%mm0 \n\t"
03062 "movq 8(%2,%0), %%mm1 \n\t"
03063 "pfmul (%3,%0), %%mm0 \n\t"
03064 "pfmul 8(%3,%0), %%mm1 \n\t"
03065 "pfadd (%4,%0), %%mm0 \n\t"
03066 "pfadd 8(%4,%0), %%mm1 \n\t"
03067 "movd %%mm0, (%1) \n\t"
03068 "movd %%mm1, 16(%1) \n\t"
03069 "psrlq $32, %%mm0 \n\t"
03070 "psrlq $32, %%mm1 \n\t"
03071 "movd %%mm0, 8(%1) \n\t"
03072 "movd %%mm1, 24(%1) \n\t"
03073 "sub $32, %1 \n\t"
03074 "sub $16, %0 \n\t"
03075 "jge 1b \n\t"
03076 :"+r"(i), "+r"(dst)
03077 :"r"(src0), "r"(src1), "r"(src2)
03078 :"memory"
03079 );
03080 }
03081 else if(step == 1 && src3 == 0){
03082 asm volatile(
03083 "1: \n\t"
03084 "movq (%2,%0), %%mm0 \n\t"
03085 "movq 8(%2,%0), %%mm1 \n\t"
03086 "pfmul (%3,%0), %%mm0 \n\t"
03087 "pfmul 8(%3,%0), %%mm1 \n\t"
03088 "pfadd (%4,%0), %%mm0 \n\t"
03089 "pfadd 8(%4,%0), %%mm1 \n\t"
03090 "movq %%mm0, (%1,%0) \n\t"
03091 "movq %%mm1, 8(%1,%0) \n\t"
03092 "sub $16, %0 \n\t"
03093 "jge 1b \n\t"
03094 :"+r"(i)
03095 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
03096 :"memory"
03097 );
03098 }
03099 else
03100 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
03101 asm volatile("femms");
03102 }
03103 static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
03104 const float *src2, int src3, int len, int step){
03105 long i = (len-8)*4;
03106 if(step == 2 && src3 == 0){
03107 dst += (len-8)*2;
03108 asm volatile(
03109 "1: \n\t"
03110 "movaps (%2,%0), %%xmm0 \n\t"
03111 "movaps 16(%2,%0), %%xmm1 \n\t"
03112 "mulps (%3,%0), %%xmm0 \n\t"
03113 "mulps 16(%3,%0), %%xmm1 \n\t"
03114 "addps (%4,%0), %%xmm0 \n\t"
03115 "addps 16(%4,%0), %%xmm1 \n\t"
03116 "movss %%xmm0, (%1) \n\t"
03117 "movss %%xmm1, 32(%1) \n\t"
03118 "movhlps %%xmm0, %%xmm2 \n\t"
03119 "movhlps %%xmm1, %%xmm3 \n\t"
03120 "movss %%xmm2, 16(%1) \n\t"
03121 "movss %%xmm3, 48(%1) \n\t"
03122 "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
03123 "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
03124 "movss %%xmm0, 8(%1) \n\t"
03125 "movss %%xmm1, 40(%1) \n\t"
03126 "movhlps %%xmm0, %%xmm2 \n\t"
03127 "movhlps %%xmm1, %%xmm3 \n\t"
03128 "movss %%xmm2, 24(%1) \n\t"
03129 "movss %%xmm3, 56(%1) \n\t"
03130 "sub $64, %1 \n\t"
03131 "sub $32, %0 \n\t"
03132 "jge 1b \n\t"
03133 :"+r"(i), "+r"(dst)
03134 :"r"(src0), "r"(src1), "r"(src2)
03135 :"memory"
03136 );
03137 }
03138 else if(step == 1 && src3 == 0){
03139 asm volatile(
03140 "1: \n\t"
03141 "movaps (%2,%0), %%xmm0 \n\t"
03142 "movaps 16(%2,%0), %%xmm1 \n\t"
03143 "mulps (%3,%0), %%xmm0 \n\t"
03144 "mulps 16(%3,%0), %%xmm1 \n\t"
03145 "addps (%4,%0), %%xmm0 \n\t"
03146 "addps 16(%4,%0), %%xmm1 \n\t"
03147 "movaps %%xmm0, (%1,%0) \n\t"
03148 "movaps %%xmm1, 16(%1,%0) \n\t"
03149 "sub $32, %0 \n\t"
03150 "jge 1b \n\t"
03151 :"+r"(i)
03152 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
03153 :"memory"
03154 );
03155 }
03156 else
03157 ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
03158 }
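
/* vector_fmul_add_add: dst = src0*src1 + src2 (plus the scalar src3, which
   must be 0 for the SIMD paths).  step==1 writes contiguously; step==2
   scatters the results to every second float, as used for windowed
   overlap-add; anything else falls back to ff_vector_fmul_add_add_c(). */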
03159
03160 static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
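    /* presumably not bit-exact with the C and SSE versions: pf2id
       truncates rather than rounding to nearest */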
03161
03162 int i;
03163 for(i=0; i<len; i+=4) {
03164 asm volatile(
03165 "pf2id %1, %%mm0 \n\t"
03166 "pf2id %2, %%mm1 \n\t"
03167 "packssdw %%mm1, %%mm0 \n\t"
03168 "movq %%mm0, %0 \n\t"
03169 :"=m"(dst[i])
03170 :"m"(src[i]), "m"(src[i+2])
03171 );
03172 }
03173 asm volatile("femms");
03174 }
03175 static void float_to_int16_sse(int16_t *dst, const float *src, int len){
03176 int i;
03177 for(i=0; i<len; i+=4) {
03178 asm volatile(
03179 "cvtps2pi %1, %%mm0 \n\t"
03180 "cvtps2pi %2, %%mm1 \n\t"
03181 "packssdw %%mm1, %%mm0 \n\t"
03182 "movq %%mm0, %0 \n\t"
03183 :"=m"(dst[i])
03184 :"m"(src[i]), "m"(src[i+2])
03185 );
03186 }
03187 asm volatile("emms");
03188 }
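
/* float_to_int16: converts len floats (len a multiple of 4) to int16_t,
   saturating through packssdw; cvtps2pi rounds according to MXCSR
   (round-to-nearest by default). */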
03189
03190 extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
03191 extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
03192 extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
03193 extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
03194 extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
03195 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
03196 extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
03197 int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
03198
03199 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
03200 {
03201 mm_flags = mm_support();
03202
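    /* apply the user-supplied capability mask: with FF_MM_FORCE the
       selected bits are forced on, otherwise they are cleared */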
03203 if (avctx->dsp_mask) {
03204 if (avctx->dsp_mask & FF_MM_FORCE)
03205 mm_flags |= (avctx->dsp_mask & 0xffff);
03206 else
03207 mm_flags &= ~(avctx->dsp_mask & 0xffff);
03208 }
03209
03210 #if 0
03211 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
03212 if (mm_flags & MM_MMX)
03213 av_log(avctx, AV_LOG_INFO, " mmx");
03214 if (mm_flags & MM_MMXEXT)
03215 av_log(avctx, AV_LOG_INFO, " mmxext");
03216 if (mm_flags & MM_3DNOW)
03217 av_log(avctx, AV_LOG_INFO, " 3dnow");
03218 if (mm_flags & MM_SSE)
03219 av_log(avctx, AV_LOG_INFO, " sse");
03220 if (mm_flags & MM_SSE2)
03221 av_log(avctx, AV_LOG_INFO, " sse2");
03222 av_log(avctx, AV_LOG_INFO, "\n");
03223 #endif
03224
03225 if (mm_flags & MM_MMX) {
03226 const int idct_algo= avctx->idct_algo;
03227
03228 #ifdef CONFIG_ENCODERS
03229 const int dct_algo = avctx->dct_algo;
03230 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
03231 if(mm_flags & MM_SSE2){
03232 c->fdct = ff_fdct_sse2;
03233 }else if(mm_flags & MM_MMXEXT){
03234 c->fdct = ff_fdct_mmx2;
03235 }else{
03236 c->fdct = ff_fdct_mmx;
03237 }
03238 }
03239 #endif //CONFIG_ENCODERS
03240 if(avctx->lowres==0){
03241 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
03242 c->idct_put= ff_simple_idct_put_mmx;
03243 c->idct_add= ff_simple_idct_add_mmx;
03244 c->idct = ff_simple_idct_mmx;
03245 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
03246 #ifdef CONFIG_GPL
03247 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
03248 if(mm_flags & MM_MMXEXT){
03249 c->idct_put= ff_libmpeg2mmx2_idct_put;
03250 c->idct_add= ff_libmpeg2mmx2_idct_add;
03251 c->idct = ff_mmxext_idct;
03252 }else{
03253 c->idct_put= ff_libmpeg2mmx_idct_put;
03254 c->idct_add= ff_libmpeg2mmx_idct_add;
03255 c->idct = ff_mmx_idct;
03256 }
03257 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
03258 #endif
03259 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
03260 idct_algo==FF_IDCT_VP3 &&
03261 avctx->codec->id!=CODEC_ID_THEORA &&
03262 !(avctx->flags & CODEC_FLAG_BITEXACT)){
03263 if(mm_flags & MM_SSE2){
03264 c->idct_put= ff_vp3_idct_put_sse2;
03265 c->idct_add= ff_vp3_idct_add_sse2;
03266 c->idct = ff_vp3_idct_sse2;
03267 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
03268 }else{
03269 ff_vp3_dsp_init_mmx();
03270 c->idct_put= ff_vp3_idct_put_mmx;
03271 c->idct_add= ff_vp3_idct_add_mmx;
03272 c->idct = ff_vp3_idct_mmx;
03273 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
03274 }
03275 }else if(idct_algo==FF_IDCT_CAVS){
03276 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
03277 }else if(idct_algo==FF_IDCT_XVIDMMX){
03278 if(mm_flags & MM_MMXEXT){
03279 c->idct_put= ff_idct_xvid_mmx2_put;
03280 c->idct_add= ff_idct_xvid_mmx2_add;
03281 c->idct = ff_idct_xvid_mmx2;
03282 }else{
03283 c->idct_put= ff_idct_xvid_mmx_put;
03284 c->idct_add= ff_idct_xvid_mmx_add;
03285 c->idct = ff_idct_xvid_mmx;
03286 }
03287 }
03288 }
03289
03290 #ifdef CONFIG_ENCODERS
03291 c->get_pixels = get_pixels_mmx;
03292 c->diff_pixels = diff_pixels_mmx;
03293 #endif //CONFIG_ENCODERS
03294 c->put_pixels_clamped = put_pixels_clamped_mmx;
03295 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
03296 c->add_pixels_clamped = add_pixels_clamped_mmx;
03297 c->clear_blocks = clear_blocks_mmx;
03298 #ifdef CONFIG_ENCODERS
03299 c->pix_sum = pix_sum16_mmx;
03300 #endif //CONFIG_ENCODERS
03301
03302 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
03303 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
03304 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
03305 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
03306 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
03307
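/* half-pel table layout: [0]=full-pel, [1]=x half-pel, [2]=y half-pel,
   [3]=xy half-pel; IDX 0 is the 16-wide set, IDX 1 the 8-wide set. */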
03308 SET_HPEL_FUNCS(put, 0, 16, mmx);
03309 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
03310 SET_HPEL_FUNCS(avg, 0, 16, mmx);
03311 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
03312 SET_HPEL_FUNCS(put, 1, 8, mmx);
03313 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
03314 SET_HPEL_FUNCS(avg, 1, 8, mmx);
03315 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
03316
03317 c->gmc= gmc_mmx;
03318
03319 c->add_bytes= add_bytes_mmx;
03320 #ifdef CONFIG_ENCODERS
03321 c->diff_bytes= diff_bytes_mmx;
03322 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
03323
03324 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
03325 c->hadamard8_diff[1]= hadamard8_diff_mmx;
03326
03327 c->pix_norm1 = pix_norm1_mmx;
03328 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
03329 c->sse[1] = sse8_mmx;
03330 c->vsad[4]= vsad_intra16_mmx;
03331
03332 c->nsse[0] = nsse16_mmx;
03333 c->nsse[1] = nsse8_mmx;
03334 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03335 c->vsad[0] = vsad16_mmx;
03336 }
03337
03338 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03339 c->try_8x8basis= try_8x8basis_mmx;
03340 }
03341 c->add_8x8basis= add_8x8basis_mmx;
03342
03343 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
03344
03345 #endif //CONFIG_ENCODERS
03346
03347 if (ENABLE_ANY_H263) {
03348 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
03349 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
03350 }
03351 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
03352 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
03353 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
03354
03355 c->h264_idct_dc_add=
03356 c->h264_idct_add= ff_h264_idct_add_mmx;
03357 c->h264_idct8_dc_add=
03358 c->h264_idct8_add= ff_h264_idct8_add_mmx;
03359 if (mm_flags & MM_SSE2)
03360 c->h264_idct8_add= ff_h264_idct8_add_sse2;
03361
03362 if (mm_flags & MM_MMXEXT) {
03363 c->prefetch = prefetch_mmx2;
03364
03365 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
03366 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
03367
03368 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
03369 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
03370 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
03371
03372 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
03373 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
03374
03375 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
03376 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
03377 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
03378
03379 #ifdef CONFIG_ENCODERS
03380 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
03381 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
03382 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
03383 c->vsad[4]= vsad_intra16_mmx2;
03384 #endif //CONFIG_ENCODERS
03385
03386 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
03387 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
03388
03389 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
03390 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
03391 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
03392 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
03393 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
03394 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
03395 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
03396 #ifdef CONFIG_ENCODERS
03397 c->vsad[0] = vsad16_mmx2;
03398 #endif //CONFIG_ENCODERS
03399 }
03400
03401 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
03402 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
03403 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
03404 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
03405 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
03406 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
03407 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
03408 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
03409 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
03410 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
03411 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
03412 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
03413 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
03414 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
03415 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
03416 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
03417 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
03418
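/* quarter-pel table layout: entry (4*Y + X) holds the mcXY function, with
   X and Y the horizontal and vertical quarter-pel phases. */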
03419 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
03420 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
03421 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
03422 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
03423 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
03424 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
03425
03426 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
03427 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
03428 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
03429 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
03430 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
03431 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
03432
03433 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
03434 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
03435 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
03436 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
03437
03438 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
03439 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
03440 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
03441 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
03442 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
03443 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
03444 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
03445 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
03446 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
03447 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
03448 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
03449
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

            if (ENABLE_CAVS_DECODER)
                ff_cavsdsp_init_mmx2(c, avctx);

            if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
                ff_vc1dsp_init_mmx(c, avctx);

#ifdef CONFIG_ENCODERS
            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
#endif //CONFIG_ENCODERS
        } else if (mm_flags & MM_3DNOW) {
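            /* 3DNow! without MMX2 (e.g. AMD K6-2/K6-III class CPUs):
             * same table layout as the MMX2 branch above, with the
             * 3DNow! implementations substituted. */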
            c->prefetch = prefetch_3dnow;

            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

            c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

            c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
            c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

            c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
            }

            SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);

            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);

            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
            SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);

            c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
            c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
        }

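/* SSSE3 H.264 qpel covers only the 16- and 8-wide tables; the 4-wide
 * entries keep the MMX2 (or 3DNow!) versions installed above. */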
#ifdef HAVE_SSSE3
        if(mm_flags & MM_SSSE3){
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, ssse3);
            SET_QPEL_FUNCS(put_h264_qpel, 1, 8, ssse3);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, ssse3);
            SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, ssse3);
        }
#endif

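/* Encoder-only cost functions: sum of absolute DCT coefficients and
 * 8x8 Hadamard (SATD-style) differences, plus FLAC autocorrelation. */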
#ifdef CONFIG_ENCODERS
        if(mm_flags & MM_SSE2){
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (ENABLE_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#ifdef HAVE_SSSE3
        if(mm_flags & MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif
#endif

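/* Note: the "& 0" makes the first condition always false, so the SSE2
 * Snow compose functions below are dead code and the MMX path is
 * always taken; presumably they were disabled deliberately. */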
#ifdef CONFIG_SNOW_DECODER
        if(mm_flags & MM_SSE2 & 0){
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#ifdef HAVE_7REGS
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
        }
        else{
            if(mm_flags & MM_MMXEXT){
                c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#ifdef HAVE_7REGS
                c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
            }
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
        }
#endif

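        /* Float DSP for audio (Vorbis coupling, vector multiplies,
         * float->int16 conversion).  Assignment order matters here:
         * later blocks overwrite earlier ones. */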
        if(mm_flags & MM_3DNOW){
#ifdef CONFIG_ENCODERS
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
#endif //CONFIG_ENCODERS
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
            c->vector_fmul = vector_fmul_3dnow;
            if(!(avctx->flags & CODEC_FLAG_BITEXACT))
                c->float_to_int16 = float_to_int16_3dnow;
        }
        if(mm_flags & MM_3DNOWEXT)
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
        if(mm_flags & MM_SSE){
            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
            c->vector_fmul = vector_fmul_sse;
            c->float_to_int16 = float_to_int16_sse;
            c->vector_fmul_reverse = vector_fmul_reverse_sse;
            c->vector_fmul_add_add = vector_fmul_add_add_sse;
        }
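        /* Deliberately placed after the MM_SSE block so that, on CPUs
         * supporting both, the 3DNow! version wins; presumably it was
         * measured to be faster than the SSE one. */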
        if(mm_flags & MM_3DNOW)
            c->vector_fmul_add_add = vector_fmul_add_add_3dnow;
    }

#ifdef CONFIG_ENCODERS
    dsputil_init_pix_mmx(c, avctx);
#endif //CONFIG_ENCODERS
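
    /* The #if 0 block below looks like leftover benchmarking
     * scaffolding: it routes the DSP entry points to just_return to
     * measure call overhead.  Note that it assigns bare names rather
     * than c-> members, so it would not compile if enabled as-is. */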
#if 0

    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

#endif
}