--- interlacing_mmx.asm 2002/09/25 16:51:05 1.1 +++ interlacing_mmx.asm 2002/09/25 16:51:05 1.1.2.1 @@ -0,0 +1,209 @@ +;/************************************************************************** +; * +; * XVID MPEG-4 VIDEO CODEC +; * mmx interlacing decision +; * +; * This program is an implementation of a part of one or more MPEG-4 +; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending +; * to use this software module in hardware or software products are +; * advised that its use may infringe existing patents or copyrights, and +; * any such use would be at such party's own risk. The original +; * developer of this software module and his/her company, and subsequent +; * editors and their companies, will have no liability for use of this +; * software or modifications or derivatives thereof. +; * +; * This program is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 2 of the License, or +; * (at your option) any later version. +; * +; * This program is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. +; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+; *
+; *************************************************************************/
+
+;/**************************************************************************
+; *
+; *	History:
+; *
+; *	04.09.2002	initial version; (c)2002 daniel smith
+; *
+; *************************************************************************/
+
+
+bits 32
+
+; cglobal NAME
+;   Exports NAME.  When PREFIX is defined, the exported symbol is _NAME and
+;   every later use of NAME in this file is rewritten to _NAME as well.
+%macro cglobal 1
+    %ifndef PREFIX
+        global %1
+    %else
+        global _%1
+        %define %1 _%1
+    %endif
+%endmacro
+
+
+section .text
+
+cglobal MBFieldTest_mmx
+
+; Byte step added to esi/edi at the end of each pass of MBFieldTest_mmx.
+; The table is indexed with the pass counter, which runs 4, 3, 2, 1, so the
+; steps are applied in the order 8, 120, 8, 0 and entry [0] is never read.
+align 16
+nexts:
+    dd 0                            ; [0]
+    dd 0                            ; [1]
+    dd 8                            ; [2]
+    dd 120                          ; [3]
+    dd 8                            ; [4]
+
+; Four words of 1: pmaddwd against this adds adjacent word sums into dwords.
+align 16
+ones:
+    dw 1, 1, 1, 1
+
+; Address of each of the 16 lines handled per pass.  Lines are 16 bytes
+; apart; lines 0-7 are addressed from esi, lines 8-15 from edi.
+%define line0   esi
+%define line1   esi+16
+%define line2   esi+32
+%define line3   esi+48
+%define line4   esi+64
+%define line5   esi+80
+%define line6   esi+96
+%define line7   esi+112
+%define line8   edi
+%define line9   edi+16
+%define line10  edi+32
+%define line11  edi+48
+%define line12  edi+64
+%define line13  edi+80
+%define line14  edi+96
+%define line15  edi+112
+
+; Register that holds line N while it is live.  mm0/mm1/mm2 are reused in
+; rotation; only three consecutive lines are needed at any one time.
+%define m00     mm0
+%define m01     mm1
+%define m02     mm2
+%define m03     mm0
+%define m04     mm1
+%define m05     mm2
+%define m06     mm0
+%define m07     mm1
+%define m08     mm2
+%define m09     mm0
+%define m10     mm1
+%define m11     mm2
+%define m12     mm0
+%define m13     mm1
+%define m14     mm2
+%define m15     mm0
+
+; ABS8 addr, low, mid, hi
+;   %1 = address of the line to load into hi
+;   %2 = register holding the low line  (destroyed)
+;   %3 = register holding the mid line  (preserved)
+;   %4 = register that receives the hi line
+;
+;   Per 16-bit lane:  mm6 (frame total) += abs(low - mid)
+;                     mm7 (field total) += abs(low - hi)
+;
+;   Clobbers %2, mm3, mm4, mm5.  The two halves below use disjoint
+;   registers; the only ordering constraint between them is that the copy of
+;   low into mm3 happens before low is overwritten.
+%macro ABS8 4
+    movq    %4, [%1]                ; hi = next line from memory
+    movq    mm3, %2                 ; mm3 = copy of low for the field difference
+
+    ; frame: abs(low - mid)
+    pxor    mm4, mm4                ; mm4 = 0
+    psubw   %2, %3                  ; %2 = low - mid
+    pcmpgtw mm4, %2                 ; mm4 lane = all 1's where the difference is negative
+    pxor    %2, mm4                 ; invert the negative lanes (one short of abs)
+    psubw   %2, mm4                 ; subtracting -1 there completes the negation
+    paddw   mm6, %2                 ; accumulate into the frame total
+
+    ; field: abs(low - hi)
+    pxor    mm5, mm5                ; mm5 = 0
+    psubw   mm3, %4                 ; mm3 = low - hi
+    pcmpgtw mm5, mm3                ; mm5 lane = all 1's where the difference is negative
+    pxor    mm3, mm5
+    psubw   mm3, mm5
+    paddw   mm7, mm3                ; accumulate into the field total
+%endmacro
+
+section .text
+
+;===========================================================================
+;
+; uint32_t MBFieldTest_mmx(int16_t * const data);
+;
+; Decides between frame and field DCT by comparing two sums of absolute
+; differences between 16-bit lines of data.
+;
+; ABI:    32-bit, argument on the stack, result in eax
+; In:     [esp+4] = data
+; Out:    eax = 1 when the frame total is at least the field total
+;               (use field DCT), otherwise 0
+; Saves:  esi, edi (pushed/popped)
+; Clobb:  ecx, edx, mm0-mm7, flags
+;
+; Four passes are made; each pass reads 8 bytes (4 words) from each of 16
+; lines spaced 16 bytes apart.  Lines 0-7 start at data plus 0, 8, 128 and
+; 136 bytes on successive passes; lines 8-15 start 256 bytes further on.
+;
+; Notes:
+;  * The running totals are packed 16-bit words (paddw), so they wrap
+;    silently on overflow -- assumes the input magnitudes keep the sums in
+;    range; TODO confirm against the caller.
+;  * No emms is executed here; the MMX state is left active on return.
+;    Presumably the caller is responsible for it -- confirm.
+;
+;===========================================================================
+
+align 16
+MBFieldTest_mmx:
+
+    push    esi
+    push    edi
+
+    mov     esi, [esp+8+4]          ; data: skip the two saved regs and the return address
+    mov     edi, esi
+    add     edi, 256                ; edi = base of lines 8-15
+
+    pxor    mm6, mm6                ; frame total (4 word lanes)
+    pxor    mm7, mm7                ; field total (4 word lanes)
+
+    mov     eax, 4                  ; pass counter, also the index into nexts
+
+.loop:
+    movq    m00, [line0]
+    movq    m01, [line1]
+
+    ; ABS8 addr, low, mid, hi: frame += abs(low-mid), field += abs(low-hi)
+    ABS8 line2, m00, m01, m02
+    ABS8 line3, m01, m02, m03
+    ABS8 line4, m02, m03, m04
+    ABS8 line5, m03, m04, m05
+    ABS8 line6, m04, m05, m06
+    ABS8 line7, m05, m06, m07
+    ABS8 line8, m06, m07, m08
+
+    ; field += abs(line7 - line9).  No frame term is added here, so the
+    ; frame pair (line7, line8) is the one that is never accumulated.
+    movq    m09, [line9]
+    pxor    mm4, mm4
+    psubw   m07, m09
+    pcmpgtw mm4, m07                ; sign mask of the difference
+    pxor    m07, mm4
+    psubw   m07, mm4
+    paddw   mm7, m07                ; add to field total
+
+    ABS8 line10, m08, m09, m10
+    ABS8 line11, m09, m10, m11
+    ABS8 line12, m10, m11, m12
+    ABS8 line13, m11, m12, m13
+    ABS8 line14, m12, m13, m14
+    ABS8 line15, m13, m14, m15
+
+    ; frame += abs(line14 - line15); the field terms are already complete
+    pxor    mm4, mm4
+    psubw   m14, m15
+    pcmpgtw mm4, m14
+    pxor    m14, mm4
+    psubw   m14, mm4
+    paddw   mm6, m14                ; add to frame total
+
+    mov     ecx, [nexts+eax*4]      ; byte step to the next group of 4 words
+    add     esi, ecx
+    add     edi, ecx
+
+    dec     eax
+    jnz     near .loop
+
+    ; eax == 0 from here on: the loop only exits when the counter hits zero
+
+.decide:
+    movq    mm0, [ones]             ; word * 1, adjacent pairs added: 4 words become 2 dwords
+    pmaddwd mm6, mm0
+    pmaddwd mm7, mm0
+
+    movq    mm0, mm6                ; fold the high dword onto the low dword
+    movq    mm1, mm7
+    psrlq   mm0, 32
+    psrlq   mm1, 32
+    paddd   mm0, mm6
+    paddd   mm1, mm7
+    movd    ecx, mm0                ; ecx = frame total
+    movd    edx, mm1                ; edx = field total
+
+    cmp     ecx, edx
+    jb      .end                    ; frame below field (unsigned): return 0
+    inc     eax                     ; frame at least field: use field dct (return 1)
+
+.end:
+    pop     edi
+    pop     esi
+
+    ret