Quelle deblock_sse2.asm

Sprache: Masm
;

;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

;

;  Use of this source code is governed by a BSD-style license

;  that can be found in the LICENSE file in the root of the source

;  tree. An additional intellectual property rights grant can be found

;  in the file PATENTS.  All contributing project authors may

;  be found in the AUTHORS file in the root of the source tree.

;

%include "vpx_ports/x86_abi_support.asm"

;macro in deblock functions

%macro FIRST_2_ROWS 0

        movdqa      xmm4,       xmm0

        movdqa      xmm6,       xmm0

        movdqa      xmm5,       xmm1

        pavgb       xmm5,       xmm3

        ;calculate absolute value

        psubusb     xmm4,       xmm1

        psubusb     xmm1,       xmm0

        psubusb     xmm6,       xmm3

        psubusb     xmm3,       xmm0

        paddusb     xmm4,       xmm1

        paddusb     xmm6,       xmm3

        ;get threshold

        movdqa      xmm2,       flimit

        pxor        xmm1,       xmm1

        movdqa      xmm7,       xmm2

        ;get mask

        psubusb     xmm2,       xmm4

        psubusb     xmm7,       xmm6

        pcmpeqb     xmm2,       xmm1

        pcmpeqb     xmm7,       xmm1

        por         xmm7,       xmm2

%endmacro

%macro SECOND_2_ROWS 0

        movdqa      xmm6,       xmm0

        movdqa      xmm4,       xmm0

        movdqa      xmm2,       xmm1

        pavgb       xmm1,       xmm3

        ;calculate absolute value

        psubusb     xmm6,       xmm2

        psubusb     xmm2,       xmm0

        psubusb     xmm4,       xmm3

        psubusb     xmm3,       xmm0

        paddusb     xmm6,       xmm2

        paddusb     xmm4,       xmm3

        pavgb       xmm5,       xmm1

        ;get threshold

        movdqa      xmm2,       flimit

        pxor        xmm1,       xmm1

        movdqa      xmm3,       xmm2

        ;get mask

        psubusb     xmm2,       xmm6

        psubusb     xmm3,       xmm4

        pcmpeqb     xmm2,       xmm1

        pcmpeqb     xmm3,       xmm1

        por         xmm7,       xmm2

        por         xmm7,       xmm3

        pavgb       xmm5,       xmm0

        ;decide if or not to use filtered value

        pand        xmm0,       xmm7

        pandn       xmm7,       xmm5

        paddusb     xmm0,       xmm7

%endmacro

%macro UPDATE_FLIMIT 0

        movdqu      xmm2,       XMMWORD PTR [rbx]

        movdqu      [rsp],      xmm2

        add         rbx,        16

%endmacro

SECTION .text

;void vpx_post_proc_down_and_across_mb_row_sse2

;(

;    unsigned char *src_ptr,

;    unsigned char *dst_ptr,

;    int src_pixels_per_line,

;    int dst_pixels_per_line,

;    int cols,

;    int *flimits,

;    int size

;)

globalsym(vpx_post_proc_down_and_across_mb_row_sse2)

sym(vpx_post_proc_down_and_across_mb_row_sse2):

    push        rbp

    mov         rbp, rsp

    SHADOW_ARGS_TO_STACK 7

    SAVE_XMM 7

    push        rbx

    push        rsi

    push        rdi

    ; end prolog

    ALIGN_STACK 16, rax

    sub         rsp, 16

        ; put flimit on stack

        mov         rbx,        arg(5)           ;flimits ptr

        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)           ;src_ptr

        mov         rdi,        arg(1)           ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line

        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock

.nextrow:

        xor         rdx,        rdx              ;col

.nextcol:

        ;load current and next 2 rows

        movdqu      xmm0,       XMMWORD PTR [rsi]

        movdqu      xmm1,       XMMWORD PTR [rsi + rax]

        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows

        neg         rax

        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]

        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                          ; positive stride

        add         rsi,        16

        add         rdi,        16

        add         rdx,        16

        cmp         edx,        dword arg(4)     ;cols

        jge         .downdone

        UPDATE_FLIMIT

        jmp         .nextcol

.downdone:

        ; done with the all cols, start the across filtering in place

        sub         rsi,        rdx

        sub         rdi,        rdx

        mov         rbx,        arg(5) ; flimits

        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times

        movq        mm1,   [rdi]

        punpcklbw   mm1,   mm1

        punpcklwd   mm1,   mm1

        punpckldq   mm1,   mm1

        mov         rdx,    -8

        movq        [rdi+rdx], mm1

        ; dup the last byte into the right border

        movsxd      rdx,    dword arg(4)

        movq        mm1,   [rdi + rdx + -1]

        punpcklbw   mm1,   mm1

        punpcklwd   mm1,   mm1

        punpckldq   mm1,   mm1

        movq        [rdi+rdx], mm1

        xor         rdx,        rdx

        movq        mm0,        QWORD PTR [rdi-16];

        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:

        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]

        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]

        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes

        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes

        movdq2q     mm0,        xmm0

        psrldq      xmm0,       8

        movdq2q     mm1,        xmm0

        add         rdx,        16

        cmp         edx,        dword arg(4)     ;cols

        jge         .acrossdone

        UPDATE_FLIMIT

        jmp         .acrossnextcol

.acrossdone:

        ; last 16 pixels

        movq        QWORD PTR [rdi+rdx-16], mm0

        cmp         edx,        dword arg(4)

        jne         .throw_last_8

        movq        QWORD PTR [rdi+rdx-8], mm1

.throw_last_8:

        ; done with this rwo

        add         rsi,rax                      ;next src line

        mov         eax, dword arg(3)            ;dst_pixels_per_line

        add         rdi,rax                      ;next destination

        mov         eax, dword arg(2)            ;src_pixels_per_line

        mov         rbx,        arg(5)           ;flimits

        UPDATE_FLIMIT

        dec         rcx                          ;decrement count

        jnz         .nextrow                     ;next row

    add rsp, 16

    pop rsp

    ; begin epilog

    pop rdi

    pop rsi

    pop rbx

    RESTORE_XMM

    UNSHADOW_ARGS

    pop         rbp

    ret

%undef flimit

;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,

;                                    int pitch, int rows, int cols,int flimit)

globalsym(vpx_mbpost_proc_across_ip_sse2)

sym(vpx_mbpost_proc_across_ip_sse2):

    push        rbp

    mov         rbp, rsp

    SHADOW_ARGS_TO_STACK 5

    SAVE_XMM 7

    GET_GOT     rbx

    push        rsi

    push        rdi

    ; end prolog

    ALIGN_STACK 16, rax

    sub         rsp, 16

    ; create flimit4 at [rsp]

    mov         eax, dword ptr arg(4) ;flimit

    mov         [rsp], eax

    mov         [rsp+4], eax

    mov         [rsp+8], eax

    mov         [rsp+12], eax

%define flimit4 [rsp]

    ;for(r=0;r<rows;r++)

.ip_row_loop:

        xor         rdx,    rdx ;sumsq=0;

        xor         rcx,    rcx ;sum=0;

        mov         rsi,    arg(0); s

        ; dup the first byte into the left border 8 times

        movq        mm1,   [rsi]

        punpcklbw   mm1,   mm1

        punpcklwd   mm1,   mm1

        punpckldq   mm1,   mm1

        mov         rdi,    -8

        movq        [rsi+rdi], mm1

        ; dup the last byte into the right border

        movsxd      rdx,    dword arg(3)

        movq        mm1,   [rsi + rdx + -1]

        punpcklbw   mm1,   mm1

        punpcklwd   mm1,   mm1

        punpckldq   mm1,   mm1

        movq        [rsi+rdx], mm1

.ip_var_loop:

        ;for(i=-8;i<=6;i++)

        ;{

        ;    sumsq += s[i]*s[i];

        ;    sum   += s[i];

        ;}

        movzx       eax, byte [rsi+rdi]

        add         ecx, eax

        mul         al

        add         edx, eax

        add         rdi, 1

        cmp         rdi, 6

        jle         .ip_var_loop

            ;mov         rax,    sumsq

            ;movd        xmm7,   rax

            movd        xmm7,   edx

            ;mov         rax,    sum

            ;movd        xmm6,   rax

            movd        xmm6,   ecx

            mov         rsi,    arg(0) ;s

            xor         rcx,    rcx

            movsxd      rdx,    dword arg(3) ;cols

            add         rdx,    8

            pxor        mm0,    mm0

            pxor        mm1,    mm1

            pxor        xmm0,   xmm0

.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5

            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

            punpcklbw   xmm1,   xmm0                    ; expanding

            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords

            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5

            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2

            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5

            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5

            paddd       xmm6,   xmm2

            paddd       xmm7,   xmm1

            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones

            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones

            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000

            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000

            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared

            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared

            paddd       xmm6,   xmm4

            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared

            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared

            paddd       xmm7,   xmm3

            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared

            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared

            paddd       xmm7,   xmm3

            paddd       xmm6,   xmm4

            movdqa      xmm3,   xmm6

            pmaddwd     xmm3,   xmm3

            movdqa      xmm5,   xmm7

            pslld       xmm5,   4

            psubd       xmm5,   xmm7

            psubd       xmm5,   xmm3

            psubd       xmm5,   flimit4

            psrad       xmm5,   31

            packssdw    xmm5,   xmm0

            packsswb    xmm5,   xmm0

            movd        xmm1,   DWORD PTR [rsi+rcx]

            movq        xmm2,   xmm1

            punpcklbw   xmm1,   xmm0

            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6

            paddd       xmm1,   [GLOBAL(four8s)]

            psrad       xmm1,   4

            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0

            pand        xmm1,   xmm5

            pandn       xmm5,   xmm2

            por         xmm5,   xmm1

            movd        [rsi+rcx-8],  mm0

            movq        mm0,    mm1

            movdq2q     mm1,    xmm5

            psrldq      xmm7,   12

            psrldq      xmm6,   12

            add         rcx,    4

            cmp         rcx,    rdx

            jl          .nextcol4

        ;s+=pitch;

        movsxd rax, dword arg(1)

        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1

        cmp dword arg(2), 0

        jg .ip_row_loop

    add         rsp, 16

    pop         rsp

    ; begin epilog

    pop rdi

    pop rsi

    RESTORE_GOT

    RESTORE_XMM

    UNSHADOW_ARGS

    pop         rbp

    ret

%undef flimit4

SECTION_RODATA

align 16

four8s:

    times 4 dd 8
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.17 Sekunden (vorverarbeitet am 2026-06-07) ¤

Wurzel
Suchen
PVS Prover
Isabelle Prover
NIST Cobol Testsuite
Cephes Mathematical Library
Vienna Development Method
Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.