;-----------------------------------------------------------------------------
; cglobal %1
;   Declare %1 as an exported (global) function symbol.
;   - If PREFIX is defined, the exported name gets a leading underscore
;     (C name mangling for a.out/Win32-style object formats) and %1 is
;     redefined so that uses of the bare name expand to _%1.
;   - If MARK_FUNCS is defined, the symbol is exported with NASM's ELF
;     "function" type and an explicit size of %1.endfunc-%1, and ENDFUNC
;     is defined to emit the .endfunc end-of-body label; otherwise ENDFUNC
;     expands to nothing.
;-----------------------------------------------------------------------------
%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1:function %1.endfunc-%1
			%define ENDFUNC .endfunc
		%else
			global _%1
			%define %1 _%1
			%define ENDFUNC
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
			%define ENDFUNC .endfunc
		%else
			global %1
			%define ENDFUNC
		%endif
	%endif
%endmacro
;=============================================================================
; Read-only data
;=============================================================================

; NOTE(review): the COFF output format gets no align qualifier here —
; presumably NASM's COFF backend rejects "align=" on this section; confirm
; against the supported NASM versions.
%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 16
; Exported entry points (declared via the cglobal macro, which handles
; optional underscore prefixing and ELF function-size marking).
cglobal interpolate8x8_halfpel_v_3dne
cglobal interpolate8x8_halfpel_hv_3dne
cglobal interpolate8x4_halfpel_h_3dne
cglobal interpolate8x4_halfpel_v_3dne
cglobal interpolate8x4_halfpel_hv_3dne
90 |
|
|
91 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
92 |
; |
; |
93 |
; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst, |
; void interpolate8x8_halfpel_h_3dne(uint8_t * const dst, |
101 |
%if (%1) |
%if (%1) |
102 |
movq mm0, [eax] |
movq mm0, [eax] |
103 |
%else |
%else |
104 |
movq mm0, [dword eax] |
movq mm0, [eax+0] |
105 |
|
; --- |
106 |
|
; nasm >0.99.x rejects the original statement: |
107 |
|
; movq mm0, [dword eax] |
108 |
|
; as it is ambiguous. for this statement nasm <0.99.x would |
109 |
|
; generate "movq mm0,[eax+0]" |
110 |
|
; --- |
111 |
%endif |
%endif |
112 |
pavgb mm0, [eax+1] |
pavgb mm0, [eax+1] |
113 |
movq mm1, [eax+edx] |
movq mm1, [eax+edx] |
156 |
COPY_H_SSE_RND0 1 |
COPY_H_SSE_RND0 1 |
157 |
ret |
ret |
158 |
|
|
159 |
.rounding1 |
.rounding1: |
160 |
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 |
161 |
mov ecx, [esp+ 4] ; Dst |
mov ecx, [esp+ 4] ; Dst |
162 |
movq mm7, [mmx_one] |
movq mm7, [mmx_one] |
168 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
169 |
COPY_H_SSE_RND1 |
COPY_H_SSE_RND1 |
170 |
ret |
ret |
171 |
|
ENDFUNC |
172 |
|
|
173 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
174 |
; |
; |
228 |
ret |
ret |
229 |
|
|
230 |
ALIGN 8 |
ALIGN 8 |
231 |
.rounding1 |
.rounding1: |
232 |
pcmpeqb mm0, mm0 |
pcmpeqb mm0, mm0 |
233 |
psubusb mm0, [eax] |
psubusb mm0, [eax] |
234 |
add eax, edx |
add eax, edx |
292 |
movq [ecx], mm4 |
movq [ecx], mm4 |
293 |
movq [ecx+edx], mm5 |
movq [ecx+edx], mm5 |
294 |
ret |
ret |
295 |
|
ENDFUNC |
296 |
|
|
297 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
298 |
; |
; |
415 |
ret |
ret |
416 |
|
|
417 |
ALIGN 16 |
ALIGN 16 |
418 |
.rounding1 |
.rounding1: |
419 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
420 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
421 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
424 |
lea ecx,[ecx+2*edx] |
lea ecx,[ecx+2*edx] |
425 |
COPY_HV_SSE_RND1 |
COPY_HV_SSE_RND1 |
426 |
ret |
ret |
427 |
|
ENDFUNC |
428 |
|
|
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_h_3dne(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
; Horizontal half-pel interpolation of an 8x4 block. Two lines are produced
; per COPY_H_SSE_RND* macro invocation (macros defined earlier in this file).
; Clobbers: eax, ecx, edx, MMX registers, [esp+16] (rounding arg), flags.
;
;-----------------------------------------------------------------------------

ALIGN 16
interpolate8x4_halfpel_h_3dne:

  mov eax, [esp+ 8]   ; Src
  mov edx, [esp+12]   ; stride
  dec dword [esp+16]  ; rounding: sets ZF when rounding was 1

  jz .rounding1       ; rounding==1 -> correction path below
  mov ecx, [esp+ 4]   ; Dst

  COPY_H_SSE_RND0 0   ; lines 0-1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND0 1   ; lines 2-3
  ret

.rounding1:
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
  mov ecx, [esp+ 4]   ; Dst
  movq mm7, [mmx_one] ; byte constant used for the (i^j)&1 correction
  COPY_H_SSE_RND1     ; lines 0-1
  lea ecx, [ecx+2*edx]
  COPY_H_SSE_RND1     ; lines 2-3
  ret
ENDFUNC
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_v_3dne(uint8_t * const dst,
;                                    const uint8_t * const src,
;                                    const uint32_t stride,
;                                    const uint32_t rounding);
;
; Vertical half-pel interpolation of an 8x4 block: each output line is the
; byte-wise average of two consecutive source lines.
; Clobbers: eax, ecx, edx, MMX registers, [esp+16] (rounding arg), flags.
;
;-----------------------------------------------------------------------------

ALIGN 16
interpolate8x4_halfpel_v_3dne:

  mov eax, [esp+ 8]         ; Src
  mov edx, [esp+12]         ; stride
  dec dword [esp+16]        ; rounding: sets ZF when rounding was 1

  ; we process 2 lines at a time

  jz .rounding1
  ; rounding==0: pavgb's round-up average (i+j+1)/2 is exactly what we need
  pxor mm2, mm2
  movq mm0, [eax]           ; line 0
  movq mm1, [eax+edx]       ; line 1
  por mm2, [eax+2*edx]      ; line 2 - "por with 0" load, something like a
                            ; preload (pipelining)
  mov ecx, [esp+ 4]         ; Dst
  lea eax, [eax+2*edx]
  pxor mm4, mm4
  pavgb mm0, mm1            ; out0 = avg(line0, line1)
  pavgb mm1, mm2            ; out1 = avg(line1, line2)
  movq [byte ecx], mm0
  movq [ecx+edx], mm1

  pxor mm6, mm6
  add eax, edx
  lea ecx, [ecx+2*edx]
  movq mm3, [byte eax]      ; line 3
  por mm4, [eax+edx]        ; line 4 (preload, as above)
  lea eax, [eax+2*edx]
  pavgb mm2, mm3            ; out2 = avg(line2, line3)
  pavgb mm3, mm4            ; out3 = avg(line3, line4)
  movq [ecx], mm2
  movq [ecx+edx], mm3

  ret

ALIGN 8
.rounding1:
  ; rounding==1 needs the round-down average (i+j)/2.  pavgb only rounds
  ; up, so average the complements instead:
  ;   255 - pavgb(255-i, 255-j) == (i+j)/2
  ; (mm_minusone is presumably all-0xFF bytes, which the identity requires
  ;  -- defined elsewhere in this file.)
  pcmpeqb mm0, mm0
  psubusb mm0, [eax]        ; mm0 = ~line0   (eax==line0)
  add eax, edx              ; eax==line1
  mov ecx, [esp+ 4]         ; Dst

  push esi                  ; esi is callee-saved

  pcmpeqb mm1, mm1
  pcmpeqb mm2, mm2
  mov esi, mm_minusone      ; esi = &mm_minusone
  psubusb mm1, [byte eax]   ; mm1 = ~line1
  psubusb mm2, [eax+edx]    ; mm2 = ~line2
  lea eax, [eax+2*edx]      ; eax==line3
  movq mm6, [esi]
  movq mm7, [esi]
  pavgb mm0, mm1            ; pavgb on complements...
  pavgb mm1, mm2
  psubusb mm6, mm0          ; ...then complement again: round-down average
  psubusb mm7, mm1
  movq [ecx], mm6           ; store line0
  movq [ecx+edx], mm7       ; store line1

  lea ecx, [ecx+2*edx]
  pcmpeqb mm3, mm3
  pcmpeqb mm4, mm4
  psubusb mm3, [eax]        ; mm3 = ~line3
  psubusb mm4, [eax+edx]    ; mm4 = ~line4
  lea eax, [eax+2*edx]      ; eax==line 5
  pavgb mm2, mm3
  pavgb mm3, mm4
  movq mm0, [esi]
  movq mm1, [esi]
  psubusb mm0, mm2
  psubusb mm1, mm3
  movq [ecx], mm0           ; store line2
  movq [ecx+edx], mm1       ; store line3

  pop esi

  ret

ENDFUNC
;-----------------------------------------------------------------------------
;
; void interpolate8x4_halfpel_hv_3dne(uint8_t * const dst,
;                                     const uint8_t * const src,
;                                     const uint32_t stride,
;                                     const uint32_t rounding);
;
; Combined horizontal+vertical half-pel interpolation of an 8x4 block,
; via the COPY_HV_SSE_RND* macros (defined earlier in this file).
; Clobbers: eax, ecx, edx, MMX registers, [esp+16] (rounding arg), flags.
;
;-----------------------------------------------------------------------------

ALIGN 16
interpolate8x4_halfpel_hv_3dne:
  mov eax, [esp+ 8]     ; Src
  mov edx, [esp+12]     ; stride
  dec dword [esp+16]    ; rounding: sets ZF when rounding was 1

  ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
  movq mm2, [eax]
  movq mm3, [eax+1]
  movq mm6, mm2
  pavgb mm2, mm3
  pxor mm3, mm6         ; mm2/mm3 ready
  mov ecx, [esp+ 4]     ; Dst
  movq mm7, [mmx_one]

  ; NOTE: ZF from the dec above is still valid here -- movq/pavgb/pxor and
  ; mov/lea do not modify eflags.
  jz near .rounding1
  lea ebp, [byte ebp]   ; multi-byte NOP used as code padding (no effect)
  COPY_HV_SSE_RND0      ; lines 0-1
  lea ecx, [ecx+2*edx]
  COPY_HV_SSE_RND0      ; lines 2-3
  ret

ALIGN 16
.rounding1:
  COPY_HV_SSE_RND1      ; lines 0-1
  lea ecx, [ecx+2*edx]
  COPY_HV_SSE_RND1      ; lines 2-3
  ret
ENDFUNC
; On ELF targets, emit an empty .note.GNU-stack section so the linker does
; not mark the stack of the final binary as executable.
%ifidn __OUTPUT_FORMAT__,elf
section ".note.GNU-stack" noalloc noexec nowrite progbits
%endif