;------------------------------------------------------------------------------
;
; This file is part of XviD, a free MPEG-4 video encoder/decoder
;
; This program is free software; you can redistribute it and/or modify it
; under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
;
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
;
; yuv_to_yuv.asm, MMX optimized color conversion
;
; Copyright (C) 2001 - Michael Militzer <isibaar@xvid.org>
;
; For more information visit the XviD homepage: http://www.xvid.org
;
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
;
; Revision history:
;
; 24.11.2001 initial version (Isibaar)
; 23.07.2002 thread safe (edgomez)
;
; $Id$
;
;------------------------------------------------------------------------------

BITS 32
; cglobal — declare a global symbol for export to C.
; When PREFIX is defined (platforms whose C symbols carry a leading
; underscore, e.g. Win32 COFF), export _%1 and redefine %1 so that all
; later references in this file resolve to the prefixed name.
%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro
SECTION .text

ALIGN 64
;------------------------------------------------------------------------------
; PLANE_COPY ( DST, DST_DIF, SRC, SRC_DIF, WIDTH, HEIGHT, OPT )
; DST      dst buffer
; DST_DIF  dst stride difference (e.g. stride - width)
; SRC      src buffer
; SRC_DIF  src stride difference (e.g. stride - width)
; WIDTH    width
; HEIGHT   height
; OPT      0=plain mmx, 1=xmm (prefetchnta + movntq non-temporal stores)
;
; Copies a WIDTH x HEIGHT plane one row at a time: 64-byte chunks first,
; then 16-byte chunks, then the remaining WIDTH %% 16 bytes via rep movsb.
; Register use: esi = src, edi = dst, ebp = rows remaining,
;               eax = width/64, ebx = (width%%64)/16, edx = width%%16,
;               ecx = inner chunk counter / movsb count.
; Clobbers: eax, ebx, ecx, edx, esi, edi, ebp, mm0-mm7, flags.
; NOTE(review): rep movsb assumes the direction flag is clear (standard
; C ABI state on entry) — confirm no caller leaves DF set.
;------------------------------------------------------------------------------
%macro PLANE_COPY 7
%define DST      %1
%define DST_DIF  %2
%define SRC      %3
%define SRC_DIF  %4
%define WIDTH    %5
%define HEIGHT   %6
%define OPT      %7

	mov eax, WIDTH
	mov ebp, HEIGHT           ; ebp = height (row counter)
	mov esi, SRC
	mov edi, DST

	mov ebx, eax
	shr eax, 6                ; eax = width / 64
	and ebx, 63               ; remainder = width % 64
	mov edx, ebx
	shr ebx, 4                ; ebx = remainder / 16
	and edx, 15               ; edx = remainder % 16

%%loop64_start:               ; top of the per-row loop
	or eax, eax               ; any 64-byte chunks in this row?
	jz %%loop16_start
	mov ecx, eax              ; ecx = number of 64-byte chunks
%%loop64:
%if OPT == 1                  ; xmm
	prefetchnta [esi + 64]    ; non temporal prefetch of the next chunk
	prefetchnta [esi + 96]
%endif
	movq mm1, [esi]           ; read 64 bytes from src
	movq mm2, [esi + 8]
	movq mm3, [esi + 16]
	movq mm4, [esi + 24]
	movq mm5, [esi + 32]
	movq mm6, [esi + 40]
	movq mm7, [esi + 48]
	movq mm0, [esi + 56]

%if OPT == 0                  ; plain mmx: cached stores
	movq [edi], mm1           ; write to y_out
	movq [edi + 8], mm2
	movq [edi + 16], mm3
	movq [edi + 24], mm4
	movq [edi + 32], mm5
	movq [edi + 40], mm6
	movq [edi + 48], mm7
	movq [edi + 56], mm0
%else                         ; xmm: non-temporal stores (bypass cache)
	movntq [edi], mm1         ; write to y_out
	movntq [edi + 8], mm2
	movntq [edi + 16], mm3
	movntq [edi + 24], mm4
	movntq [edi + 32], mm5
	movntq [edi + 40], mm6
	movntq [edi + 48], mm7
	movntq [edi + 56], mm0
%endif

	add esi, 64
	add edi, 64
	dec ecx
	jnz %%loop64

%%loop16_start:
	or ebx, ebx               ; any 16-byte chunks left in this row?
	jz %%loop1_start
	mov ecx, ebx              ; ecx = number of 16-byte chunks
%%loop16:
	movq mm1, [esi]
	movq mm2, [esi + 8]
%if OPT == 0                  ; plain mmx
	movq [edi], mm1
	movq [edi + 8], mm2
%else
	movntq [edi], mm1
	movntq [edi + 8], mm2
%endif

	add esi, 16
	add edi, 16
	dec ecx
	jnz %%loop16

%%loop1_start:
	mov ecx, edx              ; last width % 16 bytes of the row
	rep movsb

	add esi, SRC_DIF          ; skip to the next row
	add edi, DST_DIF
	dec ebp
	jnz near %%loop64_start
%endmacro
;------------------------------------------------------------------------------
;------------------------------------------------------------------------------
;------------------------------------------------------------------------------
; MAKE_YV12_TO_YV12( NAME, OPT )
; NAME    function name
; OPT     0=plain mmx, 1=xmm
;
; Emits (cdecl, 32-bit):
; yv12_to_yv12_mmx(uint8_t * y_dst, uint8_t * u_dst, uint8_t * v_dst,
;                  int y_dst_stride, int uv_dst_stride,
;                  uint8_t * y_src, uint8_t * u_src, uint8_t * v_src,
;                  int y_src_stride, int uv_src_stride,
;                  int width, int height, int vflip)
;
; Copies the three YV12 planes with PLANE_COPY. When vflip != 0 the
; source is read bottom-up: each src pointer is moved to its last row
; and the src strides are negated.
; NOTE(review): mm0-mm7 are used and no emms is executed before ret;
; presumably XviD issues emms at a higher level — confirm callers do.
;------------------------------------------------------------------------------
%macro MAKE_YV12_TO_YV12 2
%define NAME %1
%define OPT  %2
align 16
cglobal NAME
NAME:
%define pushsize 16
%define localsize 24

; incoming cdecl stack arguments, addressed past locals + saved regs
%define vflip           esp + localsize + pushsize + 52
%define height          esp + localsize + pushsize + 48
%define width           esp + localsize + pushsize + 44
%define uv_src_stride   esp + localsize + pushsize + 40
%define y_src_stride    esp + localsize + pushsize + 36
%define v_src           esp + localsize + pushsize + 32
%define u_src           esp + localsize + pushsize + 28
%define y_src           esp + localsize + pushsize + 24
%define uv_dst_stride   esp + localsize + pushsize + 20
%define y_dst_stride    esp + localsize + pushsize + 16
%define v_dst           esp + localsize + pushsize + 12
%define u_dst           esp + localsize + pushsize + 8
%define y_dst           esp + localsize + pushsize + 4
%define _ip             esp + localsize + pushsize + 0

	push ebx                  ; esp + localsize + 12
	push esi                  ; esp + localsize + 8
	push edi                  ; esp + localsize + 4
	push ebp                  ; esp + localsize + 0

; locals
%define width2      esp + localsize - 4
%define height2     esp + localsize - 8
%define y_src_dif   esp + localsize - 12
%define y_dst_dif   esp + localsize - 16
%define uv_src_dif  esp + localsize - 20
%define uv_dst_dif  esp + localsize - 24

	sub esp, localsize

	mov eax, [width]
	mov ebx, [height]
	shr eax, 1                ; calculate width/2, height/2 (chroma size)
	shr ebx, 1
	mov [width2], eax
	mov [height2], ebx

	mov ebp, [vflip]
	or ebp, ebp
	jz near .dont_flip

	; flipping support: point src at its last row, negate the stride
	mov eax, [height]
	sub eax, 1                ; eax = height - 1 (was missing: without it
	                          ; y_src pointed one row past the buffer,
	                          ; unlike the chroma path below)
	mov esi, [y_src]
	mov edx, [y_src_stride]
	push edx                  ; mul clobbers edx with the high half
	mul edx
	pop edx
	add esi, eax              ; y_src += (height-1) * y_src_stride
	neg edx
	mov [y_src], esi
	mov [y_src_stride], edx   ; y_src_stride = -y_src_stride

	mov eax, [height2]
	mov esi, [u_src]
	mov edi, [v_src]
	mov edx, [uv_src_stride]
	sub eax, 1                ; eax = height2 - 1
	push edx
	mul edx
	pop edx
	add esi, eax              ; u_src += (height2-1) * uv_src_stride
	add edi, eax              ; v_src += (height2-1) * uv_src_stride
	neg edx
	mov [u_src], esi
	mov [v_src], edi
	mov [uv_src_stride], edx  ; uv_src_stride = -uv_src_stride

.dont_flip:

	mov eax, [y_src_stride]
	mov ebx, [y_dst_stride]
	mov ecx, [uv_src_stride]
	mov edx, [uv_dst_stride]
	sub eax, [width]
	sub ebx, [width]
	sub ecx, [width2]
	sub edx, [width2]
	mov [y_src_dif], eax      ; y_src_dif = y_src_stride - width
	mov [y_dst_dif], ebx      ; y_dst_dif = y_dst_stride - width
	mov [uv_src_dif], ecx     ; uv_src_dif = uv_src_stride - width2
	mov [uv_dst_dif], edx     ; uv_dst_dif = uv_dst_stride - width2

	PLANE_COPY [y_dst], [y_dst_dif], [y_src], [y_src_dif], [width], [height], OPT
	PLANE_COPY [u_dst], [uv_dst_dif], [u_src], [uv_src_dif], [width2], [height2], OPT
	PLANE_COPY [v_dst], [uv_dst_dif], [v_src], [uv_src_dif], [width2], [height2], OPT

	add esp, localsize
	pop ebp
	pop edi
	pop esi
	pop ebx

	ret
%endmacro
;------------------------------------------------------------------------------
; Instantiate both variants of the converter.
MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0     ; plain MMX (cached stores)
MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1     ; prefetchnta + movntq stores