;/**************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  mmx cbp calc
; *
; *  This program is an implementation of a part of one or more MPEG-4
; *  Video tools as specified in ISO/IEC 14496-2 standard. Those intending
; *  to use this software module in hardware or software products are
; *  advised that its use may infringe existing patents or copyrights, and
; *  any such use would be at such party's own risk. The original
; *  developer of this software module and his/her company, and subsequent
; *  editors and their companies, will have no liability for use of this
; *  software or modifications or derivatives thereof.
; *
; *  This program is free software; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; *************************************************************************/
30 |
|
|
31 |
|
|
32 |
|
; These 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines.
;
;------------------------------------------------------------------------------
; 09.12.2002  Athlon optimizations contributed by Jaan Kalda
;------------------------------------------------------------------------------
38 |
|
|
39 |
|
bits 32

section .data

;---------------------------------------------------------------------------
; cglobal NAME
;
; Exports a symbol for NAME. The %if below is hard-wired to 1, so the
; underscore-decorated form is always the one taken:
;   - `_NAME` is declared global, and
;   - NAME is redefined as `_NAME`, so every later use of the plain name in
;     this file (including the label that defines it) assembles as `_NAME`.
; The %else arm (export NAME undecorated) is kept for reference but is never
; assembled while the condition stays at 1.
;---------------------------------------------------------------------------
%macro cglobal 1
    %if 1
        global  _%1
        %define %1 _%1
    %else
        global  %1
    %endif
%endmacro

section .text

cglobal calc_cbp_3dne
54 |
|
|
55 |
|
;===========================================================================
;
; uint32_t calc_cbp_3dne(const int16_t coeff[6][64]);
;
;===========================================================================
60 |
|
;---------------------------------------------------------------------------
; calc_cbp SLOT
;
; Examines the 128 bytes (64 sixteen-bit words) at [eax] and stores a 32-bit
; flag at [esp + SLOT*4]. The flag is non-zero exactly when at least one of
; words 1..63 is non-zero; word 0 is deliberately left out of the test.
;
; In:     eax  = address of the 64-word block
;         esp  = base of the caller's dword scratch slots
; Out:    [esp + SLOT*4] = flag
;         SLOT != 0 : eax advanced by 128 bytes (the following block)
;         SLOT == 0 : eax = 0 and edx = 0 (set up for the caller's cmp/adc)
; Clobb:  mm0-mm7, flags
;
; Memory offsets are written as 2*k, i.e. the byte offset of 16-bit word k.
; Instruction order is kept exactly as scheduled by the original author.
;---------------------------------------------------------------------------
%macro calc_cbp 1
        pshufw      mm0, [eax], 11100101b       ; picks words 1,1,2,3 -> word 0 dropped
        movq        mm1, [eax + 2*4]
        por         mm0, [eax + 2*32]
        por         mm1, [eax + 2*36]
        movq        mm2, [eax + 2*8]
        movq        mm3, [eax + 2*12]
        por         mm2, [eax + 2*40]
        por         mm3, [eax + 2*44]
        movq        mm4, [eax + 2*16]
        movq        mm5, [eax + 2*20]
        por         mm4, [eax + 2*48]
        por         mm5, [eax + 2*52]
        movq        mm6, [eax + 2*24]
        movq        mm7, [eax + 2*28]
        por         mm6, [eax + 2*56]
        por         mm7, [eax + 2*60]
        por         mm1, mm0
%if %1
        ; eax += 128. Subtracting -128 lets the constant fit a signed 8-bit
        ; immediate; the original author's note says this 3-byte form was
        ; wanted for alignment.
        sub         eax, byte -128
%else
        xor         eax, eax                    ; last block: clear both registers
        xor         edx, edx                    ; used by the caller's cmp/adc chain
%endif
        ; fold the eight partial ORs down into mm7
        por         mm3, mm2
        por         mm5, mm4
        por         mm7, mm6
        por         mm3, mm1
        por         mm7, mm5
        por         mm7, mm3
        packsswb    mm7, mm7                    ; 4 words -> 4 bytes; saturation keeps non-zero non-zero
        movd        [esp + %1*4], mm7           ; low dword = the four packed bytes
%endmacro
93 |
|
|
94 |
|
|
95 |
|
|
96 |
|
;---------------------------------------------------------------------------
; calc_cbp_3dne
;
; In:     [esp + 4] = address of six consecutive 128-byte blocks
; Out:    eax = 6-bit mask. Bit 5 belongs to the first block in memory,
;               bit 0 to the sixth. A bit is set when the calc_cbp macro
;               stored a non-zero flag for that block.
; Clobb:  eax, edx, mm0-mm7, flags
; Stack:  24 bytes of scratch (six dword flags), released before ret
; Note:   no emms is executed here, so the MMX state is left as-is on
;         return.
;---------------------------------------------------------------------------
align 16                                        ; AMD K7, in cache: ca 80 clk
calc_cbp_3dne:
        mov         eax, [esp + 4]              ; eax = coeff
        lea         esp, [esp - 6*4]            ; reserve the six flag slots

        ; Blocks are visited in memory order; the first block's flag lands in
        ; slot 5 and the last block's flag in slot 0.
%assign cbp_slot 5
%rep 6
        calc_cbp    cbp_slot
%assign cbp_slot cbp_slot-1
%endrep

        ; The slot-0 expansion left eax = edx = 0. Each `cmp 0, flag` sets the
        ; carry when flag != 0, and `adc eax, eax` shifts eax left by one while
        ; inserting that carry as the new bit 0.
        cmp         eax, [esp + 5*4]
        adc         eax, eax
%assign cbp_slot 4
%rep 5
        cmp         edx, [esp + cbp_slot*4]
        adc         eax, eax
%assign cbp_slot cbp_slot-1
%endrep

        add         esp, byte 6*4               ; drop the scratch slots
        ret