265 |
.endp sad16bi_ia64# |
.endp sad16bi_ia64# |
266 |
|
|
267 |
|
|
268 |
.common dev16#,8,8 |
|
269 |
|
|
270 |
|
|
271 |
|
|
272 |
|
|
273 |
|
|
274 |
|
.text |
275 |
.align 16 |
.align 16 |
276 |
.global dev16_ia64# |
.global dev16_ia64# |
277 |
.proc dev16_ia64# |
.proc dev16_ia64# |
278 |
|
.auto |
279 |
dev16_ia64: |
dev16_ia64: |
280 |
.prologue |
// renamings for better readability |
281 |
zxt4 r33 = r33 |
stride = r18 |
282 |
.save ar.lc, r2 |
pfs = r19 //for saving previous function state |
283 |
mov r2 = ar.lc |
cura0 = r20 //address of first 8-byte block of cur |
284 |
|
cura1 = r21 //address of second 8-byte block of cur |
285 |
|
mean0 = r22 //registers for calculating the sum in parallel |
286 |
|
mean1 = r23 |
287 |
|
mean2 = r24 |
288 |
|
mean3 = r25 |
289 |
|
dev0 = r26 //same for the deviation |
290 |
|
dev1 = r27 |
291 |
|
dev2 = r28 |
292 |
|
dev3 = r29 |
293 |
|
|
294 |
.body |
.body |
295 |
mov r21 = r0 |
alloc pfs = ar.pfs, 2, 38, 0, 40 |
296 |
mov r8 = r0 |
|
297 |
mov r23 = r32 |
mov cura0 = in0 |
298 |
mov r24 = r0 |
mov stride = in1 |
299 |
;; |
add cura1 = 8, cura0 |
300 |
mov r25 = r33 |
|
301 |
.L50: |
.rotr c[32], psad[8] // just using rotating registers to get an array ;-) |
302 |
mov r22 = r0 |
|
303 |
mov r20 = r23 |
.explicit |
304 |
;; |
{.mmi |
305 |
.L54: |
ld8 c[0] = [cura0], stride // load them ... |
306 |
mov r16 = r20 |
ld8 c[1] = [cura1], stride |
307 |
adds r14 = 2, r20 |
;; |
308 |
adds r15 = 3, r20 |
} |
309 |
;; |
{.mmi |
310 |
ld1 r17 = [r16], 1 |
ld8 c[2] = [cura0], stride |
311 |
ld1 r18 = [r14] |
ld8 c[3] = [cura1], stride |
312 |
ld1 r19 = [r15] |
;; |
313 |
;; |
} |
314 |
ld1 r14 = [r16] |
{.mmi |
315 |
add r21 = r17, r21 |
ld8 c[4] = [cura0], stride |
316 |
adds r15 = 4, r20 |
ld8 c[5] = [cura1], stride |
317 |
;; |
;; |
318 |
add r21 = r14, r21 |
} |
319 |
ld1 r16 = [r15] |
{.mmi |
320 |
adds r22 = 8, r22 |
ld8 c[6] = [cura0], stride |
321 |
;; |
ld8 c[7] = [cura1], stride |
322 |
add r21 = r18, r21 |
;; |
323 |
adds r14 = 5, r20 |
} |
324 |
adds r15 = 6, r20 |
{.mmi |
325 |
;; |
ld8 c[8] = [cura0], stride |
326 |
add r21 = r19, r21 |
ld8 c[9] = [cura1], stride |
327 |
ld1 r17 = [r14] |
;; |
328 |
ld1 r18 = [r15] |
} |
329 |
;; |
{.mmi |
330 |
add r21 = r16, r21 |
ld8 c[10] = [cura0], stride |
331 |
adds r14 = 7, r20 |
ld8 c[11] = [cura1], stride |
332 |
cmp4.geu p6, p7 = 15, r22 |
;; |
333 |
;; |
} |
334 |
add r21 = r17, r21 |
{.mii |
335 |
ld1 r15 = [r14] |
ld8 c[12] = [cura0], stride |
336 |
adds r20 = 8, r20 |
psad1 mean0 = c[0], r0 // get the sum of them ... |
337 |
;; |
psad1 mean1 = c[1], r0 |
338 |
add r21 = r18, r21 |
} |
339 |
;; |
{.mmi |
340 |
add r21 = r15, r21 |
ld8 c[13] = [cura1], stride |
341 |
(p6) br.cond.dptk .L54 |
;; |
342 |
adds r24 = 1, r24 |
ld8 c[14] = [cura0], stride |
343 |
add r23 = r23, r25 |
psad1 mean2 = c[2], r0 |
344 |
;; |
} |
345 |
cmp4.geu p6, p7 = 15, r24 |
{.mii |
346 |
(p6) br.cond.dptk .L50 |
ld8 c[15] = [cura1], stride |
347 |
extr.u r14 = r21, 8, 24 |
psad1 mean3 = c[3], r0 |
348 |
mov r23 = r32 |
;; |
349 |
mov r24 = r0 |
psad1 psad[0] = c[4], r0 |
350 |
;; |
} |
351 |
mov r21 = r14 |
{.mmi |
352 |
.L60: |
ld8 c[16] = [cura0], stride |
353 |
addl r14 = 3, r0 |
ld8 c[17] = [cura1], stride |
354 |
mov r17 = r23 |
psad1 psad[1] = c[5], r0 |
355 |
;; |
;; |
356 |
mov ar.lc = r14 |
} |
357 |
;; |
{.mii |
358 |
.L144: |
ld8 c[18] = [cura0], stride |
359 |
mov r16 = r17 |
psad1 psad[2] = c[6], r0 |
360 |
;; |
psad1 psad[3] = c[7], r0 |
361 |
ld1 r14 = [r16], 1 |
} |
362 |
;; |
{.mmi |
363 |
sub r15 = r14, r21 |
ld8 c[19] = [cura1], stride |
364 |
;; |
;; |
365 |
cmp4.ge p6, p7 = 0, r15 |
ld8 c[20] = [cura0], stride |
366 |
;; |
psad1 psad[4] = c[8], r0 |
367 |
(p7) add r8 = r8, r15 |
} |
368 |
(p6) sub r14 = r21, r14 |
{.mii |
369 |
;; |
ld8 c[21] = [cura1], stride |
370 |
(p6) add r8 = r8, r14 |
psad1 psad[5] = c[9], r0 |
371 |
ld1 r14 = [r16] |
;; |
372 |
;; |
add mean0 = mean0, psad[0] |
373 |
sub r15 = r14, r21 |
} |
374 |
adds r16 = 2, r17 |
{.mmi |
375 |
;; |
ld8 c[22] = [cura0], stride |
376 |
cmp4.ge p6, p7 = 0, r15 |
ld8 c[23] = [cura1], stride |
377 |
;; |
add mean1 = mean1, psad[1] |
378 |
(p7) add r8 = r8, r15 |
;; |
379 |
(p6) sub r14 = r21, r14 |
} |
380 |
;; |
{.mii |
381 |
(p6) add r8 = r8, r14 |
ld8 c[24] = [cura0], stride |
382 |
ld1 r14 = [r16] |
psad1 psad[0] = c[10], r0 |
383 |
;; |
psad1 psad[1] = c[11], r0 |
384 |
sub r15 = r14, r21 |
} |
385 |
adds r16 = 3, r17 |
{.mmi |
386 |
;; |
ld8 c[25] = [cura1], stride |
387 |
cmp4.ge p6, p7 = 0, r15 |
;; |
388 |
adds r17 = 4, r17 |
ld8 c[26] = [cura0], stride |
389 |
;; |
add mean2 = mean2, psad[2] |
390 |
(p7) add r8 = r8, r15 |
} |
391 |
(p6) sub r14 = r21, r14 |
{.mii |
392 |
;; |
ld8 c[27] = [cura1], stride |
393 |
(p6) add r8 = r8, r14 |
add mean3 = mean3, psad[3] |
394 |
ld1 r14 = [r16] |
;; |
395 |
;; |
psad1 psad[2] = c[12], r0 |
396 |
sub r15 = r14, r21 |
} |
397 |
;; |
{.mmi |
398 |
cmp4.ge p6, p7 = 0, r15 |
ld8 c[28] = [cura0], stride |
399 |
;; |
ld8 c[29] = [cura1], stride |
400 |
(p7) add r8 = r8, r15 |
psad1 psad[3] = c[13], r0 |
401 |
(p6) sub r14 = r21, r14 |
;; |
402 |
|
} |
403 |
|
{.mii |
404 |
|
ld8 c[30] = [cura0] |
405 |
|
psad1 psad[6] = c[14], r0 |
406 |
|
psad1 psad[7] = c[15], r0 |
407 |
|
} |
408 |
|
{.mmi |
409 |
|
ld8 c[31] = [cura1] |
410 |
|
;; |
411 |
|
add mean0 = mean0, psad[0] |
412 |
|
add mean1 = mean1, psad[1] |
413 |
|
} |
414 |
|
{.mii |
415 |
|
add mean2 = mean2, psad[4] |
416 |
|
add mean3 = mean3, psad[5] |
417 |
|
;; |
418 |
|
psad1 psad[0] = c[16], r0 |
419 |
|
} |
420 |
|
{.mmi |
421 |
|
add mean0 = mean0, psad[2] |
422 |
|
add mean1 = mean1, psad[3] |
423 |
|
psad1 psad[1] = c[17], r0 |
424 |
|
;; |
425 |
|
} |
426 |
|
{.mii |
427 |
|
add mean2 = mean2, psad[6] |
428 |
|
psad1 psad[2] = c[18], r0 |
429 |
|
psad1 psad[3] = c[19], r0 |
430 |
|
} |
431 |
|
{.mmi |
432 |
|
add mean3 = mean3, psad[7] |
433 |
|
;; |
434 |
|
add mean0 = mean0, psad[0] |
435 |
|
psad1 psad[4] = c[20], r0 |
436 |
|
} |
437 |
|
{.mii |
438 |
|
add mean1 = mean1, psad[1] |
439 |
|
psad1 psad[5] = c[21], r0 |
440 |
|
;; |
441 |
|
psad1 psad[6] = c[22], r0 |
442 |
|
} |
443 |
|
{.mmi |
444 |
|
add mean2 = mean2, psad[2] |
445 |
|
add mean3 = mean3, psad[3] |
446 |
|
psad1 psad[7] = c[23], r0 |
447 |
|
;; |
448 |
|
} |
449 |
|
{.mii |
450 |
|
add mean0 = mean0, psad[4] |
451 |
|
psad1 psad[0] = c[24], r0 |
452 |
|
psad1 psad[1] = c[25], r0 |
453 |
|
} |
454 |
|
{.mmi |
455 |
|
add mean1 = mean1, psad[5] |
456 |
|
;; |
457 |
|
add mean2 = mean2, psad[6] |
458 |
|
psad1 psad[2] = c[26], r0 |
459 |
|
} |
460 |
|
{.mii |
461 |
|
add mean3 = mean3, psad[7] |
462 |
|
psad1 psad[3] = c[27], r0 |
463 |
|
;; |
464 |
|
psad1 psad[4] = c[28], r0 |
465 |
|
} |
466 |
|
{.mmi |
467 |
|
add mean0 = mean0, psad[0] |
468 |
|
add mean1 = mean1, psad[1] |
469 |
|
psad1 psad[5] = c[29], r0 |
470 |
|
;; |
471 |
|
} |
472 |
|
{.mii |
473 |
|
add mean2 = mean2, psad[2] |
474 |
|
psad1 psad[6] = c[30], r0 |
475 |
|
psad1 psad[7] = c[31], r0 |
476 |
|
} |
477 |
|
{.mmi |
478 |
|
add mean3 = mean3, psad[3] |
479 |
|
;; |
480 |
|
add mean0 = mean0, psad[4] |
481 |
|
add mean1 = mean1, psad[5] |
482 |
|
} |
483 |
|
{.mbb |
484 |
|
add mean2 = mean2, mean3 |
485 |
|
nop.b 1 |
486 |
|
nop.b 1 |
487 |
|
;; |
488 |
|
} |
489 |
|
{.mib |
490 |
|
add mean0 = mean0, psad[6] |
491 |
|
add mean1 = mean1, psad[7] |
492 |
|
nop.b 1 |
493 |
|
;; |
494 |
|
} |
495 |
|
{.mib |
496 |
|
add mean0 = mean0, mean1 |
497 |
|
// add mean2 = 127, mean2 // this could make our division more exact, but does not help much |
498 |
|
;; |
499 |
|
} |
500 |
|
{.mib |
501 |
|
add mean0 = mean0, mean2 |
502 |
;; |
;; |
503 |
(p6) add r8 = r8, r14 |
} |
504 |
br.cloop.sptk.few .L144 |
|
505 |
adds r24 = 1, r24 |
{.mib |
506 |
add r23 = r23, r33 |
shr.u mean0 = mean0, 8 // divide them ... |
507 |
;; |
;; |
508 |
cmp4.geu p6, p7 = 15, r24 |
} |
509 |
(p6) br.cond.dptk .L60 |
{.mib |
510 |
mov ar.lc = r2 |
mux1 mean0 = mean0, @brcst |
511 |
|
;; |
512 |
|
} |
513 |
|
{.mii |
514 |
|
nop.m 0 |
515 |
|
psad1 dev0 = c[0], mean0 // and do a sad again ... |
516 |
|
psad1 dev1 = c[1], mean0 |
517 |
|
} |
518 |
|
{.mii |
519 |
|
nop.m 0 |
520 |
|
psad1 dev2 = c[2], mean0 |
521 |
|
psad1 dev3 = c[3], mean0 |
522 |
|
} |
523 |
|
{.mii |
524 |
|
nop.m 0 |
525 |
|
psad1 psad[0] = c[4], mean0 |
526 |
|
psad1 psad[1] = c[5], mean0 |
527 |
|
} |
528 |
|
{.mii |
529 |
|
nop.m 0 |
530 |
|
psad1 psad[2] = c[6], mean0 |
531 |
|
psad1 psad[3] = c[7], mean0 |
532 |
|
} |
533 |
|
{.mii |
534 |
|
nop.m 0 |
535 |
|
psad1 psad[4] = c[8], mean0 |
536 |
|
psad1 psad[5] = c[9], mean0 |
537 |
|
;; |
538 |
|
} |
539 |
|
{.mii |
540 |
|
add dev0 = dev0, psad[0] |
541 |
|
psad1 psad[6] = c[10], mean0 |
542 |
|
psad1 psad[7] = c[11], mean0 |
543 |
|
} |
544 |
|
{.mmi |
545 |
|
add dev1 = dev1, psad[1] |
546 |
|
|
547 |
|
add dev2 = dev2, psad[2] |
548 |
|
psad1 psad[0] = c[12], mean0 |
549 |
|
} |
550 |
|
{.mii |
551 |
|
add dev3 = dev3, psad[3] |
552 |
|
psad1 psad[1] = c[13], mean0 |
553 |
|
;; |
554 |
|
psad1 psad[2] = c[14], mean0 |
555 |
|
} |
556 |
|
{.mmi |
557 |
|
add dev0 = dev0, psad[4] |
558 |
|
add dev1 = dev1, psad[5] |
559 |
|
psad1 psad[3] = c[15], mean0 |
560 |
|
} |
561 |
|
{.mii |
562 |
|
add dev2 = dev2, psad[6] |
563 |
|
psad1 psad[4] = c[16], mean0 |
564 |
|
psad1 psad[5] = c[17], mean0 |
565 |
|
} |
566 |
|
{.mmi |
567 |
|
add dev3 = dev3, psad[7] |
568 |
|
;; |
569 |
|
add dev0 = dev0, psad[0] |
570 |
|
psad1 psad[6] = c[18], mean0 |
571 |
|
} |
572 |
|
{.mii |
573 |
|
add dev1 = dev1, psad[1] |
574 |
|
psad1 psad[7] = c[19], mean0 |
575 |
|
|
576 |
|
psad1 psad[0] = c[20], mean0 |
577 |
|
} |
578 |
|
{.mmi |
579 |
|
add dev2 = dev2, psad[2] |
580 |
|
add dev3 = dev3, psad[3] |
581 |
|
psad1 psad[1] = c[21], mean0 |
582 |
|
;; |
583 |
|
} |
584 |
|
{.mii |
585 |
|
add dev0 = dev0, psad[4] |
586 |
|
psad1 psad[2] = c[22], mean0 |
587 |
|
psad1 psad[3] = c[23], mean0 |
588 |
|
} |
589 |
|
{.mmi |
590 |
|
add dev1 = dev1, psad[5] |
591 |
|
|
592 |
|
add dev2 = dev2, psad[6] |
593 |
|
psad1 psad[4] = c[24], mean0 |
594 |
|
} |
595 |
|
{.mii |
596 |
|
add dev3 = dev3, psad[7] |
597 |
|
psad1 psad[5] = c[25], mean0 |
598 |
|
;; |
599 |
|
psad1 psad[6] = c[26], mean0 |
600 |
|
} |
601 |
|
{.mmi |
602 |
|
add dev0 = dev0, psad[0] |
603 |
|
add dev1 = dev1, psad[1] |
604 |
|
psad1 psad[7] = c[27], mean0 |
605 |
|
} |
606 |
|
{.mii |
607 |
|
add dev2 = dev2, psad[2] |
608 |
|
psad1 psad[0] = c[28], mean0 |
609 |
|
psad1 psad[1] = c[29], mean0 |
610 |
|
} |
611 |
|
{.mmi |
612 |
|
add dev3 = dev3, psad[3] |
613 |
|
;; |
614 |
|
add dev0 = dev0, psad[4] |
615 |
|
psad1 psad[2] = c[30], mean0 |
616 |
|
} |
617 |
|
{.mii |
618 |
|
add dev1 = dev1, psad[5] |
619 |
|
psad1 psad[3] = c[31], mean0 |
620 |
|
;; |
621 |
|
add dev2 = dev2, psad[6] |
622 |
|
} |
623 |
|
{.mmi |
624 |
|
add dev3 = dev3, psad[7] |
625 |
|
add dev0 = dev0, psad[0] |
626 |
|
add dev1 = dev1, psad[1] |
627 |
|
;; |
628 |
|
} |
629 |
|
{.mii |
630 |
|
add dev2 = dev2, psad[2] |
631 |
|
add dev3 = dev3, psad[3] |
632 |
|
add ret0 = dev0, dev1 |
633 |
|
;; |
634 |
|
} |
635 |
|
{.mib |
636 |
|
add dev2 = dev2, dev3 |
637 |
|
nop.i 1 |
638 |
|
nop.b 1 |
639 |
|
;; |
640 |
|
} |
641 |
|
{.mib |
642 |
|
add ret0 = ret0, dev2 |
643 |
|
nop.i 1 |
644 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
645 |
|
} |
646 |
.endp dev16_ia64# |
.endp dev16_ia64# |