	INTERPOLATE8X8_AVG4();
}

/*
 * This function assumes:
 *	dst is 8 byte aligned
 *	src is unaligned
 *	stride is a multiple of 8
 *	rounding is ignored
 */
void
interpolate8x8_halfpel_add_altivec_c(uint8_t *dst, const uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
	interpolate8x8_avg2_altivec_c(dst, dst, src, stride, 0, 8);
}

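/*
 * Commentary (added, not part of the original sources): vec_avg() computes
 * (a + b + 1) >> 1, i.e. it always rounds up.  The ROUND variant below
 * therefore corrects the horizontal average with tmp - ((s1 ^ s2) & 1),
 * which yields the truncating (a + b) >> 1.  Per pixel both variants end
 * up computing
 *
 *   dst[x] = (dst[x] + ((src[x] + src[x+1] + 1 - rounding) >> 1) + 1) >> 1;
 *
 * Only 8 of the 16 bytes held in the vector belong to the block, so the
 * result is merged back into memory with vec_sel() through mask_stencil
 * before the full 16-byte vec_st(); the neighbouring 8 destination bytes
 * are left untouched.
 */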
#define INTERPOLATE8X8_HALFPEL_H_ADD_ROUND() \
	mask_dst = vec_lvsl(0,dst); \
	s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
	d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
	\
	s2 = vec_perm(s1,s1,rot1); \
	tmp = vec_avg(s1,s2); \
	s1 = vec_sub(tmp,vec_and(vec_xor(s1,s2),one)); \
	\
	d = vec_avg(s1,d); \
	\
	mask = vec_perm(mask_stencil, mask_stencil, mask_dst); \
	d = vec_perm(d,d,mask_dst); \
	d = vec_sel(d,vec_ld(0,dst),mask); \
	vec_st(d,0,dst); \
	\
	dst += stride; \
	src += stride

#define INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND() \
	mask_dst = vec_lvsl(0,dst); \
	s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src)); \
	d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst); \
	\
	s1 = vec_avg(s1, vec_perm(s1,s1,rot1)); \
	d = vec_avg(s1,d); \
	\
	mask = vec_perm(mask_stencil,mask_stencil,mask_dst); \
	d = vec_perm(d,d,mask_dst); \
	d = vec_sel(d,vec_ld(0,dst),mask); \
	vec_st(d,0,dst); \
	\
	dst += stride; \
	src += stride

/*
 * This function assumes:
 *	dst is 8 byte aligned
 *	src is unaligned
 *	stride is a multiple of 8
 */
void
interpolate8x8_halfpel_h_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
	register vector unsigned char s1,s2;
	register vector unsigned char d;
	register vector unsigned char tmp;

	register vector unsigned char mask_dst;
	register vector unsigned char one;
	register vector unsigned char rot1;

	register vector unsigned char mask_stencil;
	register vector unsigned char mask;

#ifdef DEBUG
	if(((unsigned)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_h_add_altivec_c:incorrect align, dst: %x\n", dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_h_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

	/* initialization */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
	one = vec_splat_u8(1);
	rot1 = vec_lvsl(1,(unsigned char*)0);

	if(rounding) {
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();

		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_ROUND();
	}
	else {

		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();

		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_H_ADD_NOROUND();
	}
}

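/*
 * Commentary (added): the vertical variants below work like the horizontal
 * ones, but average src[x] with src[x + stride].  Each source row is loaded
 * only once: the freshly loaded row (s2) is copied into s1 at the end of
 * the macro and reused as the "previous" row of the next iteration, so the
 * caller pre-loads the first row into s1 before the first invocation.
 */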
#define INTERPOLATE8X8_HALFPEL_V_ADD_ROUND()\
	src += stride;\
	mask_dst = vec_lvsl(0,dst);\
	s2 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
	d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\
	\
	tmp = vec_avg(s1,s2);\
	s1 = vec_sub(tmp,vec_and(vec_xor(s1,s2),vec_splat_u8(1)));\
	d = vec_avg(s1,d);\
	\
	mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
	d = vec_perm(d,d,mask_dst);\
	d = vec_sel(d,vec_ld(0,dst),mask);\
	vec_st(d,0,dst);\
	\
	s1 = s2;\
	\
	dst += stride

#define INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND()\
	src += stride;\
	mask_dst = vec_lvsl(0,dst);\
	s2 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
	d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\
	\
	s1 = vec_avg(s1,s2);\
	d = vec_avg(s1,d);\
	\
	mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
	d = vec_perm(d,d,mask_dst);\
	d = vec_sel(d,vec_ld(0,dst),mask);\
	vec_st(d,0,dst);\
	\
	s1 = s2;\
	dst += stride

/*
 * This function assumes:
 *	dst: 8 byte aligned
 *	src: unaligned
 *	stride is a multiple of 8
 */
void
interpolate8x8_halfpel_v_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
	register vector unsigned char s1,s2;
	register vector unsigned char tmp;
	register vector unsigned char d;

	register vector unsigned char mask;
	register vector unsigned char mask_dst;
	register vector unsigned char mask_stencil;

#ifdef DEBUG
	if(((unsigned)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_v_add_altivec_c:incorrect align, dst: %x\n", dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_v_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

	/* initialization */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

	if(rounding) {

		/* Interpolate vertical with rounding */
		s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));

		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();

		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_ROUND();
	}
	else {

		/* Interpolate vertical without rounding */
		s1 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));

		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();

		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_V_ADD_NOROUND();
	}
}

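/*
 * Commentary (added): the 2D (halfpel h+v) variants below keep the current
 * and next source rows, each in an unshifted (c00/c10) and a 1-byte-shifted
 * (c01/c11) copy, widened to 16 bits with vec_mergeh() so the four-pixel
 * sum cannot overflow.  Per pixel they compute
 *
 *   rounding == 1:  dst[x] = (dst[x] + ((a + b + c + d + 1) >> 2)) >> 1;
 *   rounding == 0:  dst[x] = (dst[x] + ((a + b + c + d + 2) >> 2) + 1) >> 1;
 *
 * where a,b are two adjacent pixels of the current row and c,d those of the
 * row below.  c00/c01 are carried over between iterations, so every source
 * row is loaded exactly once.
 */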
#define INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND()\
	src += stride;\
	mask_dst = vec_lvsl(0,dst);\
	c10 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
	d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\
	c11 = vec_perm(c10,c10,rot1);\
	\
	s00 = (vector unsigned short)vec_mergeh(zero,c00);\
	s01 = (vector unsigned short)vec_mergeh(zero,c01);\
	s10 = (vector unsigned short)vec_mergeh(zero,c10);\
	s11 = (vector unsigned short)vec_mergeh(zero,c11);\
	\
	s00 = vec_add(s00,s10);\
	s01 = vec_add(s01,s11);\
	s00 = vec_add(s00,s01);\
	s00 = vec_add(s00,one);\
	\
	s00 = vec_sr(s00,two);\
	s00 = vec_add(s00, (vector unsigned short)vec_mergeh(zero,d));\
	s00 = vec_sr(s00,one);\
	\
	d = vec_pack(s00,s00);\
	mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
	d = vec_sel(d,vec_ld(0,dst),mask);\
	vec_st(d,0,dst);\
	\
	c00 = c10;\
	c01 = c11;\
	dst += stride

#define INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND()\
	src += stride;\
	mask_dst = vec_lvsl(0,dst);\
	c10 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
	d = vec_perm(vec_ld(0,dst),vec_ld(16,dst),mask_dst);\
	c11 = vec_perm(c10,c10,rot1);\
	\
	s00 = (vector unsigned short)vec_mergeh(zero,c00);\
	s01 = (vector unsigned short)vec_mergeh(zero,c01);\
	s10 = (vector unsigned short)vec_mergeh(zero,c10);\
	s11 = (vector unsigned short)vec_mergeh(zero,c11);\
	\
	s00 = vec_add(s00,s10);\
	s01 = vec_add(s01,s11);\
	s00 = vec_add(s00,s01);\
	s00 = vec_add(s00,two);\
	s00 = vec_sr(s00,two);\
	\
	c00 = vec_pack(s00,s00);\
	d = vec_avg(d,c00);\
	\
	mask = vec_perm(mask_stencil,mask_stencil,mask_dst);\
	d = vec_perm(d,d,mask_dst);\
	d = vec_sel(d,vec_ld(0,dst),mask);\
	vec_st(d,0,dst);\
	\
	c00 = c10;\
	c01 = c11;\
	dst += stride

/*
 * This function assumes:
 *	dst: 8 byte aligned
 *	src: unaligned
 *	stride: multiple of 8
 */
void
interpolate8x8_halfpel_hv_add_altivec_c(uint8_t *dst, uint8_t *src, const uint32_t stride, const uint32_t rounding)
{
	register vector unsigned char c00,c10,c01,c11;
	register vector unsigned short s00,s10,s01,s11;
	register vector unsigned char d;

	register vector unsigned char mask;
	register vector unsigned char mask_stencil;

	register vector unsigned char rot1;
	register vector unsigned char mask_dst;
	register vector unsigned char zero;
	register vector unsigned short one,two;

#ifdef DEBUG
	if(((unsigned)dst) & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_hv_add_altivec_c:incorrect align, dst: %x\n", dst);
	if(stride & 0x7)
		fprintf(stderr, "interpolate8x8_halfpel_hv_add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

	/* initialization */
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));
	rot1 = vec_lvsl(1,(unsigned char*)0);
	zero = vec_splat_u8(0);
	one = vec_splat_u16(1);
	two = vec_splat_u16(2);

	if(rounding) {

		/* Load the first row 'manually' */
		c00 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));
		c01 = vec_perm(c00,c00,rot1);

		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();

		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_ROUND();
	}
	else {

		/* Load the first row 'manually' */
		c00 = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));
		c01 = vec_perm(c00,c00,rot1);

		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();

		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
		INTERPOLATE8X8_HALFPEL_HV_ADD_NOROUND();
	}
}

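/*
 * Commentary (added): the QPEL section that follows interpolates with the
 * 6-tap FIR used for quarter-pel motion compensation.  Reading the macro
 * below, each output pixel is
 *
 *   dst[x] = clamp_0_255((x[-2] + x[3] + 20*(x[0] + x[1]) - 5*(x[-1] + x[2])
 *                         + 16 - rounding) >> 5);
 *
 * i.e. the (1,-5,20,20,-5,1)/32 filter, with vec_dstt()/vec_dss() used to
 * prefetch the source rows and vec_packsu() providing the saturation.
 */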
/*************************************************************
 *               QPEL STUFF STARTS HERE                      *
 *************************************************************/

#define INTERPOLATE8X8_6TAP_LOWPASS_H() \
	vec_dstt(src, prefetch_constant, 0); \
	data = vec_perm(vec_ld(-2, src), vec_ld(14, src), vec_lvsl(-2, src)); \
	s1 = (vector signed short)vec_mergeh(zerovec, data); \
	t = vec_perm(data, data, vec_lvsl(5, (unsigned char*)0)); \
	s2 = (vector signed short)vec_mergeh(zerovec, t); \
	d = vec_add(s1, s2); \
	\
	t = vec_perm(data, data, vec_lvsl(2, (unsigned char*)0)); \
	s1 = (vector signed short)vec_mergeh(zerovec, t); \
	t = vec_perm(data, data, vec_lvsl(3, (unsigned char*)0)); \
	s2 = (vector signed short)vec_mergeh(zerovec, t); \
	s1 = vec_add(s1,s2); \
	z = vec_sl(s1, vec_splat_u16(2)); \
	t = vec_perm(data, data, vec_lvsl(1, (unsigned char*)0)); \
	s1 = (vector signed short)vec_mergeh(zerovec, t); \
	t = vec_perm(data, data, vec_lvsl(4, (unsigned char*)0)); \
	s2 = (vector signed short)vec_mergeh(zerovec, t); \
	s1 = vec_add(s1, s2); \
	z = vec_sub(z, s1); \
	z = vec_add(vec_sl(z, vec_splat_u16(2)), z); \
	d = vec_add(d, z); \
	\
	d = vec_add(d, round_add); \
	d = vec_sra(d, vec_splat_u16(5)); \
	\
	t = vec_packsu(d, (vector signed short)zerovec); \
	mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
	t = vec_perm(t, t, vec_lvsl(0, dst)); \
	t = vec_sel(t, vec_ld(0, dst), mask); \
	vec_st(t, 0, dst); \
	\
	dst += stride; \
	src += stride

/* This function assumes:
 *	dst is 8 byte aligned
 *	src is unaligned
 *	stride is a multiple of 8
 */
void
interpolate8x8_6tap_lowpass_h_altivec_c(uint8_t *dst, uint8_t *src, int32_t stride, int32_t rounding)
{
	vector signed short s1, s2;
	vector signed short z;
	vector signed short d;
	vector signed short round_add;
	vector unsigned char t;
	vector unsigned char data;
	vector unsigned char mask;
	vector unsigned char mask_stencil;
	vector unsigned char zerovec;

	unsigned prefetch_constant;

	zerovec = vec_splat_u8(0);
	*((short*)&round_add) = (short)(16 - rounding);
	round_add = vec_splat(round_add, 0);
	mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

	prefetch_constant = build_prefetch(1, 4, (short)stride);

	INTERPOLATE8X8_6TAP_LOWPASS_H();
	INTERPOLATE8X8_6TAP_LOWPASS_H();
	INTERPOLATE8X8_6TAP_LOWPASS_H();
	INTERPOLATE8X8_6TAP_LOWPASS_H();

	INTERPOLATE8X8_6TAP_LOWPASS_H();
	INTERPOLATE8X8_6TAP_LOWPASS_H();
	INTERPOLATE8X8_6TAP_LOWPASS_H();
	INTERPOLATE8X8_6TAP_LOWPASS_H();

	vec_dss(0);
}