29 |
%ifdef MARK_FUNCS |
%ifdef MARK_FUNCS |
30 |
global _%1:function %1.endfunc-%1 |
global _%1:function %1.endfunc-%1 |
31 |
%define %1 _%1:function %1.endfunc-%1 |
%define %1 _%1:function %1.endfunc-%1 |
32 |
|
%define ENDFUNC .endfunc |
33 |
%else |
%else |
34 |
global _%1 |
global _%1 |
35 |
%define %1 _%1 |
%define %1 _%1 |
36 |
|
%define ENDFUNC |
37 |
%endif |
%endif |
38 |
%else |
%else |
39 |
%ifdef MARK_FUNCS |
%ifdef MARK_FUNCS |
40 |
global %1:function %1.endfunc-%1 |
global %1:function %1.endfunc-%1 |
41 |
|
%define ENDFUNC .endfunc |
42 |
%else |
%else |
43 |
global %1 |
global %1 |
44 |
|
%define ENDFUNC |
45 |
%endif |
%endif |
46 |
%endif |
%endif |
47 |
%endmacro |
%endmacro |
55 |
paddw %1, %2 |
paddw %1, %2 |
56 |
%endmacro |
%endmacro |
57 |
|
|
|
;load a dq from mem to a xmm reg |
|
|
%macro LOAD_XMM 2 |
|
|
movdqu %1,[%2] |
|
|
;movhps %1,[%2+8] |
|
|
%endmacro |
|
|
|
|
|
%macro WRITE_XMM 2 |
|
|
;movlps [%1],%2 |
|
|
;movhps [%1+8],%2 |
|
|
movdqu [%1],%2 |
|
|
%endmacro |
|
|
|
|
58 |
%macro CONSIM_1x8_SSE2 0 |
%macro CONSIM_1x8_SSE2 0 |
59 |
LOAD_XMM xmm0,ecx |
movdqu xmm0,[ecx] |
60 |
LOAD_XMM xmm1,edx |
movdqu xmm1,[edx] |
|
pxor xmm2,xmm2 |
|
61 |
|
|
62 |
;unpack to words |
;unpack to words |
63 |
punpcklbw xmm0,xmm2 |
punpcklbw xmm0,xmm2 |
64 |
punpcklbw xmm1,xmm2 |
punpcklbw xmm1,xmm2 |
65 |
|
|
66 |
;devo |
movaps xmm3,xmm0 |
67 |
psubw xmm0,xmm6 |
movaps xmm4,xmm1 |
|
movaps xmm2,xmm0 |
|
|
pmaddwd xmm2,xmm0 |
|
|
paddd xmm3,xmm2 |
|
|
|
|
|
;devc |
|
|
psubw xmm1,xmm7 |
|
|
movaps xmm2,xmm1 |
|
|
pmaddwd xmm2,xmm1 |
|
|
paddd xmm4,xmm2 |
|
|
|
|
|
;corr |
|
|
pmaddwd xmm1,xmm0 |
|
|
paddd xmm5,xmm1 |
|
|
%endmacro |
|
68 |
|
|
69 |
|
pmaddwd xmm0,xmm0;orig |
70 |
|
pmaddwd xmm1,xmm1;comp |
71 |
|
pmaddwd xmm3,xmm4;corr |
72 |
|
|
73 |
|
paddd xmm5,xmm0 |
74 |
|
paddd xmm6,xmm1 |
75 |
|
paddd xmm7,xmm3 |
76 |
|
%endmacro |
77 |
|
|
78 |
%macro CONSIM_1x8_MMX 0 |
%macro CONSIM_1x8_MMX 0 |
79 |
movq mm0,[ecx];orig |
movq mm0,[ecx];orig |
80 |
movq mm1,[edx];comp |
movq mm1,[edx];comp |
|
pxor mm2,mm2;null vector |
|
81 |
|
|
82 |
;unpack low half of qw to words |
;unpack low half of qw to words |
83 |
punpcklbw mm0,mm2 |
punpcklbw mm0,mm2 |
84 |
punpcklbw mm1,mm2 |
punpcklbw mm1,mm2 |
85 |
|
|
86 |
;devo |
movq mm3,mm0 |
87 |
psubw mm0,mm6 |
pmaddwd mm3,mm0 |
88 |
movq mm2,mm0 |
paddd mm5,mm3; |
89 |
pmaddwd mm2,mm0 |
|
90 |
paddd mm3,mm2; |
movq mm4,mm1 |
91 |
|
pmaddwd mm4,mm1 |
92 |
;devc |
paddd mm6,mm4; |
|
psubw mm1,mm7 |
|
|
movq mm2,mm1 |
|
|
pmaddwd mm2,mm1 |
|
|
paddd mm4,mm2 |
|
93 |
|
|
|
;corr |
|
94 |
pmaddwd mm1,mm0 |
pmaddwd mm1,mm0 |
95 |
paddd mm5,mm1 |
paddd mm7,mm1 |
96 |
|
|
97 |
movq mm0,[ecx] |
movq mm0,[ecx];orig |
98 |
movq mm1,[edx] |
movq mm1,[edx];comp |
|
pxor mm2,mm2;null vector |
|
99 |
|
|
100 |
;unpack high half of qw to words |
;unpack high half of qw to words |
101 |
punpckhbw mm0,mm2 |
punpckhbw mm0,mm2 |
102 |
punpckhbw mm1,mm2 |
punpckhbw mm1,mm2 |
103 |
|
|
104 |
;devo |
movq mm3,mm0 |
105 |
psubw mm0,mm6 |
pmaddwd mm3,mm0 |
106 |
movq mm2,mm0 |
paddd mm5,mm3; |
107 |
pmaddwd mm2,mm0 |
|
108 |
paddd mm3,mm2; |
movq mm4,mm1 |
109 |
|
pmaddwd mm4,mm1 |
110 |
;devc |
paddd mm6,mm4; |
|
psubw mm1,mm7 |
|
|
movq mm2,mm1 |
|
|
pmaddwd mm2,mm1 |
|
|
paddd mm4,mm2 |
|
111 |
|
|
|
;corr |
|
112 |
pmaddwd mm1,mm0 |
pmaddwd mm1,mm0 |
113 |
paddd mm5,mm1 |
paddd mm7,mm1 |
114 |
%endmacro |
%endmacro |
115 |
|
|
116 |
|
%macro CONSIM_WRITEOUT 3 |
117 |
|
mov eax,[esp + 16];lumo |
118 |
|
mul eax; lumo^2 |
119 |
|
add eax, 32 |
120 |
|
shr eax,6; 64*lum0^2 |
121 |
|
movd ecx,%1 |
122 |
|
sub ecx,eax |
123 |
|
|
124 |
|
mov edx,[esp + 24]; pdevo |
125 |
|
mov [edx],ecx |
126 |
|
|
127 |
|
mov eax,[esp + 20];lumc |
128 |
|
mul eax; lumc^2 |
129 |
|
add eax, 32 |
130 |
|
shr eax,6; 64*lumc^2 |
131 |
|
movd ecx,%2 |
132 |
|
sub ecx,eax |
133 |
|
|
134 |
|
mov edx,[esp + 28]; pdevc |
135 |
|
mov [edx],ecx |
136 |
|
|
137 |
|
mov eax,[esp + 16];lumo |
138 |
|
mul dword [esp + 20]; lumo*lumc, should fit in eax |
139 |
|
add eax, 32 |
140 |
|
shr eax,6; 64*lumo*lumc |
141 |
|
movd ecx,%3 |
142 |
|
sub ecx,eax |
143 |
|
|
144 |
|
mov edx,[esp + 32]; pcorr |
145 |
|
mov [edx],ecx |
146 |
|
%endmacro |
147 |
|
|
148 |
|
|
149 |
SECTION .text |
SECTION .text |
175 |
|
|
176 |
movd eax,mm1 |
movd eax,mm1 |
177 |
ret |
ret |
178 |
.endfunc |
ENDFUNC |
179 |
|
|
180 |
ALIGN 16 |
ALIGN 16 |
181 |
consim_mmx: |
consim_sse2: |
182 |
mov ecx,[esp+4] ;ptro |
mov ecx,[esp+4] ;ptro |
|
pxor mm6,mm6; |
|
|
|
|
183 |
mov edx,[esp+8] ;ptrc |
mov edx,[esp+8] ;ptrc |
|
pxor mm3,mm3;devo |
|
|
pxor mm4,mm4;devc |
|
|
movd mm6,[esp + 16];lumo |
|
|
pxor mm7,mm7 |
|
184 |
mov eax,[esp+12];stride |
mov eax,[esp+12];stride |
|
movd mm7,[esp + 20];lumc |
|
|
pshufw mm6,mm6,00000000b ; TODO: remove later! not MMX, but SSE |
|
|
pxor mm5,mm5;corr |
|
|
pshufw mm7,mm7,00000000b |
|
185 |
|
|
186 |
CONSIM_1x8_MMX |
pxor xmm2,xmm2;null vektor |
187 |
|
pxor xmm5,xmm5;devo |
188 |
|
pxor xmm6,xmm6;devc |
189 |
|
pxor xmm7,xmm7;corr |
190 |
|
|
191 |
|
;broadcast lumo/c |
192 |
|
punpcklbw xmm6,xmm6 |
193 |
|
punpcklwd xmm6,xmm6 |
194 |
|
pshufd xmm6,xmm6,00000000b;or shufps |
195 |
|
punpcklbw xmm7,xmm7 |
196 |
|
punpcklwd xmm7,xmm7 |
197 |
|
pshufd xmm7,xmm7,00000000b |
198 |
|
|
199 |
|
CONSIM_1x8_SSE2 |
200 |
add ecx,eax |
add ecx,eax |
201 |
add edx,eax |
add edx,eax |
202 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
203 |
add ecx,eax |
add ecx,eax |
204 |
add edx,eax |
add edx,eax |
205 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
206 |
add ecx,eax |
add ecx,eax |
207 |
add edx,eax |
add edx,eax |
208 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
209 |
add ecx,eax |
add ecx,eax |
210 |
add edx,eax |
add edx,eax |
211 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
212 |
add ecx,eax |
add ecx,eax |
213 |
add edx,eax |
add edx,eax |
214 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
215 |
add ecx,eax |
add ecx,eax |
216 |
add edx,eax |
add edx,eax |
217 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
218 |
add ecx,eax |
add ecx,eax |
219 |
add edx,eax |
add edx,eax |
220 |
CONSIM_1x8_MMX |
CONSIM_1x8_SSE2 |
221 |
|
|
222 |
pshufw mm0,mm3,01001110b |
;accumulate xmm5-7 |
223 |
paddd mm3,mm0 |
pshufd xmm0, xmm5, 0x0E |
224 |
pshufw mm1,mm4,01001110b |
paddd xmm5, xmm0 |
225 |
paddd mm4,mm1 |
pshufd xmm0, xmm5, 0x01 |
226 |
pshufw mm2,mm5,01001110b |
paddd xmm5, xmm0 |
227 |
paddd mm5,mm2 |
|
228 |
|
pshufd xmm1, xmm6, 0x0E |
229 |
;load target pointer |
paddd xmm6, xmm1 |
230 |
mov ecx,[esp + 24]; pdevo |
pshufd xmm1, xmm6, 0x01 |
231 |
movd [ecx],mm3 |
paddd xmm6, xmm1 |
232 |
mov edx,[esp + 28]; pdevc |
|
233 |
movd [edx],mm4 |
pshufd xmm2, xmm7, 0x0E |
234 |
mov eax,[esp + 32]; corr |
paddd xmm7, xmm2 |
235 |
movd [eax],mm5 |
pshufd xmm2, xmm7, 0x01 |
236 |
emms |
paddd xmm7, xmm2 |
237 |
|
|
238 |
|
CONSIM_WRITEOUT xmm5,xmm6,xmm7 |
239 |
ret |
ret |
240 |
.endfunc |
ENDFUNC |
241 |
|
|
242 |
consim_sse2: |
|
243 |
|
|
244 |
|
|
245 |
|
|
246 |
|
ALIGN 16 |
247 |
|
consim_mmx: |
248 |
mov ecx,[esp+4] ;ptro |
mov ecx,[esp+4] ;ptro |
|
pxor xmm6,xmm6; |
|
249 |
mov edx,[esp+8] ;ptrc |
mov edx,[esp+8] ;ptrc |
|
pxor xmm3,xmm3;devo |
|
|
pxor xmm4,xmm4;devc |
|
|
movd xmm6,[esp + 16];lumo |
|
|
pxor xmm7,xmm7 |
|
250 |
mov eax,[esp+12];stride |
mov eax,[esp+12];stride |
251 |
movd xmm7,[esp + 20];lumc |
pxor mm2,mm2;null |
252 |
pxor xmm5,xmm5;corr |
pxor mm5,mm5;devo |
253 |
|
pxor mm6,mm6;devc |
254 |
|
pxor mm7,mm7;corr |
255 |
|
|
256 |
;broadcast lumo/c |
CONSIM_1x8_MMX |
|
;punpcklbw xmm6,xmm6 |
|
|
punpcklwd xmm6,xmm6 |
|
|
pshufd xmm6,xmm6,00000000b;or shufps |
|
|
;punpcklbw xmm7,xmm7 |
|
|
punpcklwd xmm7,xmm7 |
|
|
pshufd xmm7,xmm7,00000000b |
|
|
|
|
|
CONSIM_1x8_SSE2 |
|
257 |
add ecx,eax |
add ecx,eax |
258 |
add edx,eax |
add edx,eax |
259 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
260 |
add ecx,eax |
add ecx,eax |
261 |
add edx,eax |
add edx,eax |
262 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
263 |
add ecx,eax |
add ecx,eax |
264 |
add edx,eax |
add edx,eax |
265 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
266 |
add ecx,eax |
add ecx,eax |
267 |
add edx,eax |
add edx,eax |
268 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
269 |
add ecx,eax |
add ecx,eax |
270 |
add edx,eax |
add edx,eax |
271 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
272 |
add ecx,eax |
add ecx,eax |
273 |
add edx,eax |
add edx,eax |
274 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
275 |
add ecx,eax |
add ecx,eax |
276 |
add edx,eax |
add edx,eax |
277 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_MMX |
278 |
|
|
279 |
;accumulate xmm3-5 |
movq mm0,mm5 |
280 |
pshufd xmm0, xmm3, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) |
psrlq mm0,32 |
281 |
paddd xmm3, xmm0 ; Sums are in 2 dwords |
paddd mm5,mm0 |
282 |
pshufd xmm0, xmm3, 01H ; Get bit 32-63 from xmm0 |
movq mm1,mm6 |
283 |
paddd xmm3, xmm0 ; Sum is in one dword |
psrlq mm1,32 |
284 |
|
paddd mm6,mm1 |
285 |
pshufd xmm1, xmm4, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) |
movq mm2,mm7 |
286 |
paddd xmm4, xmm1 ; Sums are in 2 dwords |
psrlq mm2,32 |
287 |
pshufd xmm1, xmm4, 01H ; Get bit 32-63 from xmm0 |
paddd mm7,mm2 |
288 |
paddd xmm4, xmm1 ; Sum is in one dword |
|
289 |
|
CONSIM_WRITEOUT mm5,mm6,mm7 |
|
pshufd xmm2, xmm5, 0EH ; Get bit 64-127 from xmm1 (or use movhlps) |
|
|
paddd xmm5, xmm2 ; Sums are in 2 dwords |
|
|
pshufd xmm2, xmm5, 01H ; Get bit 32-63 from xmm0 |
|
|
paddd xmm5, xmm2 ; Sum is in one dword |
|
|
|
|
|
|
|
|
;load target pointer |
|
|
mov ecx,[esp + 24]; pdevo |
|
|
movd [ecx],xmm3 |
|
|
mov edx,[esp + 28]; pdevc |
|
|
movd [edx],xmm4 |
|
|
mov eax,[esp + 32]; corr |
|
|
movd [eax],xmm5 |
|
290 |
ret |
ret |
|
.endfunc |
|
291 |
|
ENDFUNC |
292 |
|
|
293 |
|
%ifidn __OUTPUT_FORMAT__,elf |
294 |
|
section ".note.GNU-stack" noalloc noexec nowrite progbits |
295 |
|
%endif |