--- colorspace_yuv_mmx.asm	2002/11/03 04:51:33	1.1
+++ colorspace_yuv_mmx.asm	2003/02/15 15:22:18	1.2
@@ -0,0 +1,282 @@
+;------------------------------------------------------------------------------
+;
+; This file is part of XviD, a free MPEG-4 video encoder/decoder
+;
+; This program is free software; you can redistribute it and/or modify it
+; under the terms of the GNU General Public License as published by
+; the Free Software Foundation; either version 2 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+;
+;------------------------------------------------------------------------------
+;------------------------------------------------------------------------------
+;
+; colorspace_yuv_mmx.asm, MMX optimized color conversion
+;
+; Copyright (C) 2001 - Michael Militzer
+;
+; For more information visit the XviD homepage: http://www.xvid.org
+;
+;------------------------------------------------------------------------------
+;------------------------------------------------------------------------------
+;
+; Revision history:
+;
+; 24.11.2001 initial version (Isibaar)
+; 23.07.2002 thread safe (edgomez)
+;
+; $Id: colorspace_yuv_mmx.asm,v 1.2 2003/02/15 15:22:18 edgomez Exp $
+;
+;------------------------------------------------------------------------------
+
+BITS 32
+
+%macro cglobal 1
+%ifdef PREFIX
+  global _%1
+  %define %1 _%1
+%else
+  global %1
+%endif
+%endmacro
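+
+;------------------------------------------------------------------------------
+; Note on cglobal: when PREFIX is defined, the macro exports the symbol with
+; a leading underscore (and redefines the bare name to match), which is what
+; C compilers on Win32/a.out-style targets expect; on ELF the plain name is
+; used. Either way, the C side just declares the function normally. A sketch
+; of such a declaration, assuming the caller supplies it (XviD's real headers
+; may differ; the return type is not stated in this file, void is assumed):
+;
+;   extern void yv12_to_yv12_mmx(uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
+;                                int y_dst_stride, int uv_dst_stride,
+;                                uint8_t *y_src, uint8_t *u_src, uint8_t *v_src,
+;                                int y_src_stride, int uv_src_stride,
+;                                int width, int height, int vflip);
+;------------------------------------------------------------------------------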
+
+SECTION .text
+
+ALIGN 64
+
+;------------------------------------------------------------------------------
+; PLANE_COPY ( DST, DST_DIF, SRC, SRC_DIF, WIDTH, HEIGHT, OPT )
+; DST       dst buffer
+; DST_DIF   dst stride difference (e.g. stride - width)
+; SRC       src buffer
+; SRC_DIF   src stride difference (e.g. stride - width)
+; WIDTH     width
+; HEIGHT    height
+; OPT       0=plain mmx, 1=xmm
+;------------------------------------------------------------------------------
+%macro PLANE_COPY 7
+%define DST     %1
+%define DST_DIF %2
+%define SRC     %3
+%define SRC_DIF %4
+%define WIDTH   %5
+%define HEIGHT  %6
+%define OPT     %7
+
+  mov eax, WIDTH
+  mov ebp, HEIGHT           ; ebp = height
+  mov esi, SRC
+  mov edi, DST
+
+  mov ebx, eax
+  shr eax, 6                ; eax = width / 64
+  and ebx, 63               ; remainder = width % 64
+  mov edx, ebx
+  shr ebx, 4                ; ebx = remainder / 16
+  and edx, 15               ; edx = remainder % 16
+
+%%loop64_start:
+  or eax, eax
+  jz %%loop16_start
+  mov ecx, eax              ; width64
+%%loop64:
+%if OPT == 1                ; xmm
+  prefetchnta [esi + 64]    ; non-temporal prefetch
+  prefetchnta [esi + 96]
+%endif
+  movq mm1, [esi]           ; read from src
+  movq mm2, [esi + 8]
+  movq mm3, [esi + 16]
+  movq mm4, [esi + 24]
+  movq mm5, [esi + 32]
+  movq mm6, [esi + 40]
+  movq mm7, [esi + 48]
+  movq mm0, [esi + 56]
+
+%if OPT == 0                ; plain mmx
+  movq [edi], mm1           ; write to dst
+  movq [edi + 8], mm2
+  movq [edi + 16], mm3
+  movq [edi + 24], mm4
+  movq [edi + 32], mm5
+  movq [edi + 40], mm6
+  movq [edi + 48], mm7
+  movq [edi + 56], mm0
+%else
+  movntq [edi], mm1         ; write to dst, bypassing the cache
+  movntq [edi + 8], mm2
+  movntq [edi + 16], mm3
+  movntq [edi + 24], mm4
+  movntq [edi + 32], mm5
+  movntq [edi + 40], mm6
+  movntq [edi + 48], mm7
+  movntq [edi + 56], mm0
+%endif
+
+  add esi, 64
+  add edi, 64
+  dec ecx
+  jnz %%loop64
+
+
+%%loop16_start:
+  or ebx, ebx
+  jz %%loop1_start
+  mov ecx, ebx              ; width16
+%%loop16:
+  movq mm1, [esi]
+  movq mm2, [esi + 8]
+%if OPT == 0                ; plain mmx
+  movq [edi], mm1
+  movq [edi + 8], mm2
+%else
+  movntq [edi], mm1
+  movntq [edi + 8], mm2
+%endif
+
+  add esi, 16
+  add edi, 16
+  dec ecx
+  jnz %%loop16
+
+
+%%loop1_start:
+  mov ecx, edx              ; copy the remaining width % 16 bytes
+  rep movsb
+
+  add esi, SRC_DIF          ; skip the stride padding to the next row
+  add edi, DST_DIF
+  dec ebp
+  jnz near %%loop64_start
+%endmacro
+;------------------------------------------------------------------------------
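+
+;------------------------------------------------------------------------------
+; Reference (C): what PLANE_COPY computes, ignoring the 64/16/1-byte
+; unrolling above. A sketch for documentation only (plane_copy_c is a
+; hypothetical name); note DST_DIF/SRC_DIF are stride minus width, so adding
+; them after each row jumps over the stride padding:
+;
+;   void plane_copy_c(uint8_t *dst, int dst_dif,
+;                     const uint8_t *src, int src_dif,
+;                     int width, int height)
+;   {
+;       int x, y;
+;       for (y = 0; y < height; y++) {
+;           for (x = 0; x < width; x++)
+;               *dst++ = *src++;       /* copy one row */
+;           src += src_dif;            /* advance to the next row start */
+;           dst += dst_dif;
+;       }
+;   }
+;------------------------------------------------------------------------------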
+
+
+;------------------------------------------------------------------------------
+; MAKE_YV12_TO_YV12( NAME, OPT )
+; NAME    function name
+; OPT     0=plain mmx, 1=xmm
+;
+; yv12_to_yv12_mmx(uint8_t * y_dst, uint8_t * u_dst, uint8_t * v_dst,
+;                  int y_dst_stride, int uv_dst_stride,
+;                  uint8_t * y_src, uint8_t * u_src, uint8_t * v_src,
+;                  int y_src_stride, int uv_src_stride,
+;                  int width, int height, int vflip)
+;------------------------------------------------------------------------------
+%macro MAKE_YV12_TO_YV12 2
+%define NAME %1
+%define OPT  %2
+align 16
+cglobal NAME
+NAME:
+%define pushsize 16
+%define localsize 24
+
+%define vflip           esp + localsize + pushsize + 52
+%define height          esp + localsize + pushsize + 48
+%define width           esp + localsize + pushsize + 44
+%define uv_src_stride   esp + localsize + pushsize + 40
+%define y_src_stride    esp + localsize + pushsize + 36
+%define v_src           esp + localsize + pushsize + 32
+%define u_src           esp + localsize + pushsize + 28
+%define y_src           esp + localsize + pushsize + 24
+%define uv_dst_stride   esp + localsize + pushsize + 20
+%define y_dst_stride    esp + localsize + pushsize + 16
+%define v_dst           esp + localsize + pushsize + 12
+%define u_dst           esp + localsize + pushsize + 8
+%define y_dst           esp + localsize + pushsize + 4
+%define _ip             esp + localsize + pushsize + 0
+
+  push ebx                  ; esp + localsize + 12
+  push esi                  ; esp + localsize + 8
+  push edi                  ; esp + localsize + 4
+  push ebp                  ; esp + localsize + 0
+
+%define width2          esp + localsize - 4
+%define height2         esp + localsize - 8
+%define y_src_dif       esp + localsize - 12
+%define y_dst_dif       esp + localsize - 16
+%define uv_src_dif      esp + localsize - 20
+%define uv_dst_dif      esp + localsize - 24
+
+  sub esp, localsize
+
+  mov eax, [width]
+  mov ebx, [height]
+  shr eax, 1                ; calculate width/2, height/2
+  shr ebx, 1
+  mov [width2], eax
+  mov [height2], ebx
+
+  mov ebp, [vflip]
+  or ebp, ebp
+  jz near .dont_flip
+
+; flipping support
+  mov eax, [height]
+  mov esi, [y_src]
+  mov edx, [y_src_stride]
+  sub eax, 1                ; eax = height - 1
+  push edx                  ; mul clobbers edx
+  mul edx
+  pop edx
+  add esi, eax              ; y_src += (height-1) * y_src_stride
+  neg edx
+  mov [y_src], esi
+  mov [y_src_stride], edx   ; y_src_stride = -y_src_stride
+
+  mov eax, [height2]
+  mov esi, [u_src]
+  mov edi, [v_src]
+  mov edx, [uv_src_stride]
+  sub eax, 1                ; eax = height2 - 1
+  push edx                  ; mul clobbers edx
+  mul edx
+  pop edx
+  add esi, eax              ; u_src += (height2-1) * uv_src_stride
+  add edi, eax              ; v_src += (height2-1) * uv_src_stride
+  neg edx
+  mov [u_src], esi
+  mov [v_src], edi
+  mov [uv_src_stride], edx  ; uv_src_stride = -uv_src_stride
+
+.dont_flip:
+
+  mov eax, [y_src_stride]
+  mov ebx, [y_dst_stride]
+  mov ecx, [uv_src_stride]
+  mov edx, [uv_dst_stride]
+  sub eax, [width]
+  sub ebx, [width]
+  sub ecx, [width2]
+  sub edx, [width2]
+  mov [y_src_dif], eax      ; y_src_dif = y_src_stride - width
+  mov [y_dst_dif], ebx      ; y_dst_dif = y_dst_stride - width
+  mov [uv_src_dif], ecx     ; uv_src_dif = uv_src_stride - width2
+  mov [uv_dst_dif], edx     ; uv_dst_dif = uv_dst_stride - width2
+
+  PLANE_COPY [y_dst], [y_dst_dif],  [y_src], [y_src_dif],  [width],  [height],  OPT
+  PLANE_COPY [u_dst], [uv_dst_dif], [u_src], [uv_src_dif], [width2], [height2], OPT
+  PLANE_COPY [v_dst], [uv_dst_dif], [v_src], [uv_src_dif], [width2], [height2], OPT
+
+  add esp, localsize
+  pop ebp
+  pop edi
+  pop esi
+  pop ebx
+
+  ret
+%endmacro
+;------------------------------------------------------------------------------
+
+
+MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0
+MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1
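+
+;------------------------------------------------------------------------------
+; Usage sketch (C): copying one frame with vertical flip enabled. The plane
+; pointers and dimensions below are hypothetical placeholders (allocation is
+; up to the caller; in a packed YV12 buffer the V plane precedes U). Strides
+; are passed separately from width because they may carry padding; the
+; unpadded case is shown:
+;
+;   uint8_t *y_d, *u_d, *v_d, *y_s, *u_s, *v_s;   /* allocated elsewhere */
+;   int w = 352, h = 288;
+;
+;   yv12_to_yv12_mmx(y_d, u_d, v_d, w, w / 2,
+;                    y_s, u_s, v_s, w, w / 2,
+;                    w, h, /* vflip = */ 1);
+;------------------------------------------------------------------------------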