;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; Word constants private to this file. Each label holds 16 copies of the
; 16-bit value, so a full-register load yields the value in every word lane.
; pw_511/pw_m512 and pw_2047/pw_m2048 are the signed clip bounds selected by
; LOOP_FILTER (as pw_<maxsgn>/pw_<minsgn>) for 10 and 12 bpp respectively.
pw_511: times 16 dw 511
pw_2047: times 16 dw 2047
; multiplier for pmulhrsw in filter_4, giving the rounded halving (f1+1)>>1
pw_16384: times 16 dw 16384
pw_m512: times 16 dw -512
pw_m2048: times 16 dw -2048
; Word constants defined in another object file.
; pw_4/pw_16 double as the flat threshold F (pw_<maxf>) for 10/12 bpp,
; pw_1023/pw_4095 as the unsigned pixel maximum (pw_<maxusgn>).
cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1
SECTION .text
; SCRATCH src, dst, mem[, name]
;
; Move the value held in m<src> out of the way so that the low register can
; be reused.
;   x86-64: SWAP src with dst; if a name is given, reg_<name> refers to
;           m<dst> afterwards.
;   x86-32: store m<src> to [mem]; if a name is given, reg_<name> refers to
;           the memory operand [mem].
; The optional 4th argument is the suffix of the reg_<name> alias to define.
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro
; UNSCRATCH dst, src, mem[, name]
;
; Counterpart of SCRATCH: make a previously parked value available in m<dst>.
;   x86-64: SWAP dst with src.
;   x86-32: reload m<dst> from [mem].
; If a name is given, the reg_<name> alias is undefined afterwards.
%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro
; PRELOAD reg, mem[, name]
;
; Make the value at [mem] usable as an instruction operand.
;   x86-64: load [mem] into m<reg>; if a name is given, reg_<name> refers to
;           m<reg>.
;   x86-32: emit no load; if a name is given, reg_<name> refers to the memory
;           operand [mem] itself. Without a name nothing is emitted.
%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro
; Compute one half (p side or q side) of the inverted flat8out condition.
;
; in:  m0     = innermost pixel of this side (q0 or p0)
;      m4-m7  = the four outer pixels of this side (q4-q7 or p4-p7)
;      reg_F  = flat threshold
; out: m7     = !flat8out for this side (set where any |outer - inner| > F)
; clobbers m2-m6
; (the inline comments use the q-side names)
%macro FLAT8OUT_HALF 0
    ; first pair of outer pixels
    psubw               m4, m0                      ; q4-q0
    psubw               m5, m0                      ; q5-q0
    ABS2                m4, m5, m2, m3              ; abs(q4-q0) | abs(q5-q0)
    pcmpgtw             m4, reg_F                   ; abs(q4-q0) > F
    pcmpgtw             m5, reg_F                   ; abs(q5-q0) > F
    por                 m5, m4

    ; second pair of outer pixels
    psubw               m6, m0                      ; q6-q0
    psubw               m7, m0                      ; q7-q0
    ABS2                m6, m7, m2, m3              ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw             m6, reg_F                   ; abs(q6-q0) > F
    pcmpgtw             m7, reg_F                   ; abs(q7-q0) > F
    por                 m7, m6

    ; combine both pairs
    por                 m7, m5                      ; !flat8out, this side
%endmacro
; Compute one half (p side or q side) of the flat8in, hev and fm conditions.
; The mb_edge term of fm is not part of this macro.
;
; arg: %1 = filter width; the flat8in part is only computed when it is > 4
; in:  m0-m3            = the four inner pixels of this side (q0-q3 or p0-p3)
;      reg_I, reg_H     = I and H thresholds
;      reg_F            = flat threshold (only read when %1 > 4)
; out: m2 = !fm for this side (without mb_edge)
;      m7 = hev for this side
;      m4 = !flat8in for this side (only when %1 > 4)
; clobbers m3, m5, m6
; (the inline comments use the q-side names)
%macro FLAT8IN_HALF 1
%if %1 > 4
    ; flat8in: distance of the two outer pixels from the innermost one
    psubw               m4, m3, m0                  ; q3-q0
    psubw               m5, m2, m0                  ; q2-q0
    ABS2                m4, m5, m6, m7              ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw             m4, reg_F                   ; abs(q3-q0) > F
    pcmpgtw             m5, reg_F                   ; abs(q2-q0) > F
    por                 m4, m5
%endif

    ; fm: differences between neighbouring pixels
    psubw               m3, m2                      ; q3-q2
    psubw               m2, m1                      ; q2-q1
    ABS2                m3, m2, m6, m7              ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw             m3, reg_I                   ; abs(q3-q2) > I
    pcmpgtw             m2, reg_I                   ; abs(q2-q1) > I
    por                 m2, m3

    ; abs(q1-q0) feeds all three conditions
    psubw               m3, m1, m0                  ; q1-q0
    ABS1                m3, m5                      ; abs(q1-q0)
%if %1 > 4
    pcmpgtw             m6, m3, reg_F               ; abs(q1-q0) > F
    por                 m4, m6
%endif
    pcmpgtw             m7, m3, reg_H               ; abs(q1-q0) > H
    pcmpgtw             m3, reg_I                   ; abs(q1-q0) > I
    por                 m2, m3
%endmacro
; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory, instead keep the
; values in register so we can write it out later
;
; arguments:
;   %1  tmp         scratch register; receives the filtered pixel when storing
;   %2  reg         running sum, updated in place for the next step
;   %3  mask        suffix of the reg_<mask> alias holding the filter mask
;   %4  shift       right-shift applied to the sum
;   %5  dst         address the result is stored to (unused if dont_store)
;   %6  src/sub1    unfiltered pixel of this row; also subtracted from the sum
;   %7  sub2        second value subtracted from the sum (optional)
;   %8  add1        first value added to the sum (optional)
;   %9  add2        second value added to the sum (optional)
;   %10 dont_store  when 1, accumulate the masked delta into src/sub1 instead
;                   of storing; src/sub1 must then be a register
;
; The result is computed as src + ((sum >> shift) - src) & mask, so pixels
; outside the mask keep their original value.
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
                                      ; src/sub1, sub2, add1, add2, dont_store
    psrlw               %1, %2, %4
    psubw               %1, %6                      ; abs->delta
%ifnidn %7, ""
    psubw               %2, %6
    psubw               %2, %7
    paddw               %2, %8
    paddw               %2, %9
%endif
    pand                %1, reg_%3                  ; apply mask
%if %10 == 1
    paddw               %6, %1                      ; delta->abs
%else
    paddw               %1, %6                      ; delta->abs
    mova              [%5], %1
%endif
%endmacro
; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}

; LOOP_FILTER dir, wd, bpp
;
; Emit one function vp9_loop_filter_<dir>_<wd>_<bpp> for the instruction set
; that is active when the macro is invoked.
;
;   dir  h or v; for h the pixels are loaded from dst, transposed and kept on
;        the stack, for v they are addressed in place relative to dst
;   wd   4, 8 or 16; selects which of filter_4 / filter_6 / filter_14 exist
;   bpp  10 or 12; selects the clip bounds, the pixel maximum and the flat
;        threshold, and the amount E/I/H are shifted left by (bpp - 8)
;
; Function arguments (see cglobal below): dst, stride, E, I, H.
;
; Stack layout, in units of mmsize, from rsp upwards:
;   wd_mem   slots: backup of pre-filter pixels used by filter_14/filter_6
;   bak_mem  slots: spill area for thresholds and masks (x86-32 only)
;   tsp_mem  slots: transposed pixels (dir == h only)
%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]

%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; %2 == 4
%assign %%num_xmm_regs 14
%endif ; %2
%assign %%bak_mem 0
%else ; ARCH_X86_32
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; %2 == 4
%assign %%bak_mem 5
%endif ; %2
%endif ; ARCH_X86_64/32

%if %2 == 16
%ifidn %1, v
%assign %%num_gpr_regs 6
%else ; %1 == h
%assign %%num_gpr_regs 5
%endif ; %1
%assign %%wd_mem 6
%else ; %2 == 8/4
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%assign %%wd_mem 2
%else ; ARCH_X86_64 || %2 == 4
%assign %%wd_mem 0
%endif ; ARCH_X86_64/32 etc.
%endif ; %2

%ifidn %1, v
%assign %%tsp_mem 0
%elif %2 == 16 ; && %1 == h
%assign %%tsp_mem 16
%else ; %1 == h && %2 == 8/4
%assign %%tsp_mem 8
%endif ; %1/%2

%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem+%%wd_mem
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)

%if %3 == 10
%define %%maxsgn 511
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxf 4
%else ; %3 == 12
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
%define %%maxf 16
%endif ; %3

; the function name must be a single token: %1, %2 and %3 are pasted directly
; onto the prefix, without intervening whitespace
cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
    ; prepare E, I and H masks
    shl                 Ed, %3 - 8
    shl                 Id, %3 - 8
    shl                 Hd, %3 - 8
%if cpuflag(ssse3)
    mova                m0, [pw_256]
%endif
    movd                m1, Ed
    movd                m2, Id
    movd                m3, Hd
%if cpuflag(ssse3)
    pshufb              m1, m0                      ; E << (bit_depth - 8)
    pshufb              m2, m0                      ; I << (bit_depth - 8)
    pshufb              m3, m0                      ; H << (bit_depth - 8)
%else
    punpcklwd           m1, m1
    punpcklwd           m2, m2
    punpcklwd           m3, m3
    pshufd              m1, m1, q0000
    pshufd              m2, m2, q0000
    pshufd              m3, m3, q0000
%endif
    SCRATCH              1,  8, rsp+(%%off+0)*mmsize,  E
    SCRATCH              2,  9, rsp+(%%off+1)*mmsize,  I
    SCRATCH              3, 10, rsp+(%%off+2)*mmsize,  H
%if %2 > 4
    PRELOAD                 11, pw_ %+ %%maxf, F
%endif

    ; set up variables to load data
%ifidn %1, v
    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea           stride3q, [strideq*3]
    neg            strideq
%if %2 == 16
    lea              dst0q, [dst8q+strideq*8]
%else
    lea              dst4q, [dst8q+strideq*4]
%endif
    neg            strideq
%if %2 == 16
    lea             dst12q, [dst8q+strideq*4]
    lea              dst4q, [dst0q+strideq*4]
%endif

%if %2 == 16
%define %%p7 dst0q
%define %%p6 dst0q+strideq
%define %%p5 dst0q+strideq*2
%define %%p4 dst0q+stride3q
%endif
%define %%p3 dst4q
%define %%p2 dst4q+strideq
%define %%p1 dst4q+strideq*2
%define %%p0 dst4q+stride3q
%define %%q0 dst8q
%define %%q1 dst8q+strideq
%define %%q2 dst8q+strideq*2
%define %%q3 dst8q+stride3q
%if %2 == 16
%define %%q4 dst12q
%define %%q5 dst12q+strideq
%define %%q6 dst12q+strideq*2
%define %%q7 dst12q+stride3q
%endif
%else ; %1 == h
    DEFINE_ARGS dst0, stride, stride3, dst4
    lea           stride3q, [strideq*3]
    lea              dst4q, [dst0q+strideq*4]

%define %%p3 rsp+(%%tspoff+0)*mmsize
%define %%p2 rsp+(%%tspoff+1)*mmsize
%define %%p1 rsp+(%%tspoff+2)*mmsize
%define %%p0 rsp+(%%tspoff+3)*mmsize
%define %%q0 rsp+(%%tspoff+4)*mmsize
%define %%q1 rsp+(%%tspoff+5)*mmsize
%define %%q2 rsp+(%%tspoff+6)*mmsize
%define %%q3 rsp+(%%tspoff+7)*mmsize

%if %2 < 16
    movu                m0, [dst0q+strideq*0-8]
    movu                m1, [dst0q+strideq*1-8]
    movu                m2, [dst0q+strideq*2-8]
    movu                m3, [dst0q+stride3q -8]
    movu                m4, [dst4q+strideq*0-8]
    movu                m5, [dst4q+strideq*1-8]
    movu                m6, [dst4q+strideq*2-8]
    movu                m7, [dst4q+stride3q -8]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
%endif

    mova            [%%p3], m0
    mova            [%%p2], m1
    mova            [%%p1], m2
    mova            [%%p0], m3
%if ARCH_X86_64
    mova            [%%q0], m4
%endif
    mova            [%%q1], m5
    mova            [%%q2], m6
    mova            [%%q3], m7

    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
    ; order here accordingly
%else ; %2 == 16

%define %%p7 rsp+(%%tspoff+ 8)*mmsize
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
%define %%p5 rsp+(%%tspoff+10)*mmsize
%define %%p4 rsp+(%%tspoff+11)*mmsize
%define %%q4 rsp+(%%tspoff+12)*mmsize
%define %%q5 rsp+(%%tspoff+13)*mmsize
%define %%q6 rsp+(%%tspoff+14)*mmsize
%define %%q7 rsp+(%%tspoff+15)*mmsize

    mova                m0, [dst0q+strideq*0-16]
    mova                m1, [dst0q+strideq*1-16]
    mova                m2, [dst0q+strideq*2-16]
    mova                m3, [dst0q+stride3q -16]
    mova                m4, [dst4q+strideq*0-16]
    mova                m5, [dst4q+strideq*1-16]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2-16]
%endif
    mova                m7, [dst4q+stride3q -16]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
%endif

    mova            [%%p7], m0
    mova            [%%p6], m1
    mova            [%%p5], m2
    mova            [%%p4], m3
%if ARCH_X86_64
    mova            [%%p3], m4
%endif
    mova            [%%p2], m5
    mova            [%%p1], m6
    mova            [%%p0], m7

    mova                m0, [dst0q+strideq*0]
    mova                m1, [dst0q+strideq*1]
    mova                m2, [dst0q+strideq*2]
    mova                m3, [dst0q+stride3q ]
    mova                m4, [dst4q+strideq*0]
    mova                m5, [dst4q+strideq*1]
%if ARCH_X86_64
    mova                m6, [dst4q+strideq*2]
%endif
    mova                m7, [dst4q+stride3q ]

%if ARCH_X86_64
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
%endif

    mova            [%%q0], m0
    mova            [%%q1], m1
    mova            [%%q2], m2
    mova            [%%q3], m3
%if ARCH_X86_64
    mova            [%%q4], m4
%endif
    mova            [%%q5], m5
    mova            [%%q6], m6
    mova            [%%q7], m7

    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
    ; order here accordingly
%endif ; %2
%endif ; %1

    ; load q0|q4-7 data
    mova                m0, [%%q0]
%if %2 == 16
    mova                m4, [%%q4]
    mova                m5, [%%q5]
    mova                m6, [%%q6]
    mova                m7, [%%q7]

    ; flat8out q portion
    FLAT8OUT_HALF
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; load q1-3 data
    mova                m1, [%%q1]
    mova                m2, [%%q2]
    mova                m3, [%%q3]

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flatout[q]
    ; m12-14=free
    ; m0-3=q0-q3
    ; m4-7=free

    ; flat8in|fm|hev q portion
    FLAT8IN_HALF        %2
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
%if %2 > 4
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8I
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; m2=!fm[q]
    ; m0,1=q0-q1
    ; m3-7=free
    ; m12=free

    ; load p0-1
    mova                m3, [%%p0]
    mova                m4, [%%p1]

    ; fm mb_edge portion
    psubw               m5, m3, m0                  ; q0-p0
    psubw               m6, m4, m1                  ; q1-p1
%if ARCH_X86_64
    ABS2                m5, m6, m7, m12             ; abs(q0-p0) | abs(q1-p1)
%else
    ABS1                m5, m7                      ; abs(q0-p0)
    ABS1                m6, m7                      ; abs(q1-p1)
%endif
    paddw               m5, m5
    psraw               m6, 1
    paddw               m6, m5                      ; abs(q0-p0)*2+(abs(q1-p1)>>1)
    pcmpgtw             m6, reg_E
    por                 m2, m6
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, FM

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m3-4=p0-1
    ; m0-2/5-7=free

    ; load p4-7 data
    SWAP                 3, 0                       ; p0
    SWAP                 4, 1                       ; p1
%if %2 == 16
    mova                m7, [%%p7]
    mova                m6, [%%p6]
    mova                m5, [%%p5]
    mova                m4, [%%p4]

    ; flat8out p portion
    FLAT8OUT_HALF
    por                 m7, reg_F8O
    SCRATCH              7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m0,1=p0-p1
    ; m2-7=free

    ; load p2-3 data
    mova                m2, [%%p2]
    mova                m3, [%%p3]

    ; flat8in|fm|hev p portion
    FLAT8IN_HALF        %2
    por                 m7, reg_HEV
%if %2 > 4
    por                 m4, reg_F8I
%endif
    por                 m2, reg_FM
%if %2 > 4
    por                 m4, m2                      ; !flat8|!fm
%if %2 == 16
    por                 m5, m4, reg_F8O             ; !flat16|!fm
    pandn               m2, m4                      ; filter4_mask
    pandn               m4, m5                      ; filter8_mask
    pxor                m5, [pw_m1]                 ; filter16_mask
    SCRATCH              5, 15, rsp+(%%off+6)*mmsize, F16M
%else
    pandn               m2, m4                      ; filter4_mask
    pxor                m4, [pw_m1]                 ; filter8_mask
%endif
    SCRATCH              4, 14, rsp+(%%off+5)*mmsize, F8M
%else
    pxor                m2, [pw_m1]                 ; filter4_mask
%endif
    SCRATCH              7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH              2, 12, rsp+(%%off+3)*mmsize, F4M

    ; r9[m15]=filter16_mask
    ; r10[m13]=hev
    ; r11[m14]=filter8_mask
    ; r12[m12]=filter4_mask
    ; m0,1=p0-p1
    ; m2-7=free
    ; m8-11=free

%if %2 > 4
%if %2 == 16
    ; filter_14
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m6, [%%p5]
    mova                m7, [%%p4]
    PRELOAD              8, %%p3, P3
    PRELOAD              9, %%p2, P2
%endif
    PRELOAD             10, %%q0, Q0
    PRELOAD             11, %%q1, Q1
%if %2 == 16
    psllw               m4, m2, 3
    paddw               m5, m3, m3
    paddw               m4, m6
    paddw               m5, m7
    paddw               m4, reg_P3
    paddw               m5, reg_P2
    paddw               m4, m1
    paddw               m5, m0
    paddw               m4, reg_Q0                  ; q0+p1+p3+p5+p7*8
    psubw               m5, m2                      ; p0+p2+p4+p6*2-p7
    paddw               m4, [pw_8]
    paddw               m5, m4                      ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8

    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
    ; at the end of the filter

    mova    [rsp+0*mmsize], m3
    FILTER_STEP         m4, m5, F16M, 4, %%p6, m3,     m2,             m6,     reg_Q1
%endif
    mova                m3, [%%q2]
%if %2 == 16
    mova    [rsp+1*mmsize], m6
    FILTER_STEP         m4, m5, F16M, 4, %%p5, m6,     m2,             m7,     m3
%endif
    mova                m6, [%%q3]
%if %2 == 16
    mova    [rsp+2*mmsize], m7
    FILTER_STEP         m4, m5, F16M, 4, %%p4, m7,     m2,             reg_P3, m6
    mova                m7, [%%q4]
%if ARCH_X86_64
    mova    [rsp+3*mmsize], reg_P3
%else
    mova                m4, reg_P3
    mova    [rsp+3*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p3, reg_P3, m2,             reg_P2, m7
    PRELOAD              8, %%q5, Q5
%if ARCH_X86_64
    mova    [rsp+4*mmsize], reg_P2
%else
    mova                m4, reg_P2
    mova    [rsp+4*mmsize], m4
%endif
    FILTER_STEP         m4, m5, F16M, 4, %%p2, reg_P2, m2,             m1,     reg_Q5
    PRELOAD              9, %%q6, Q6
    mova    [rsp+5*mmsize], m1
    FILTER_STEP         m4, m5, F16M, 4, %%p1, m1,     m2,             m0,     reg_Q6
    mova                m1, [%%q7]
    FILTER_STEP         m4, m5, F16M, 4, %%p0, m0,     m2,             reg_Q0, m1, 1
    FILTER_STEP         m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m1, ARCH_X86_64
    FILTER_STEP         m4, m5, F16M, 4, %%q2, m3,     [rsp+2*mmsize], m6,     m1, 1
    FILTER_STEP         m4, m5, F16M, 4, %%q3, m6,     [rsp+3*mmsize], m7,     m1
    FILTER_STEP         m4, m5, F16M, 4, %%q4, m7,     [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP         m4, m5, F16M, 4, %%q6, reg_Q6

    mova                m7, [%%p1]
%else
    SWAP                 1, 7
%endif

    mova                m2, [%%p3]
    mova                m1, [%%p2]

    ; reg_Q0-1 (m10-m11)
    ; m0=p0
    ; m1=p2
    ; m2=p3
    ; m3=q2
    ; m4-5=free
    ; m6=q3
    ; m7=p1
    ; m8-9 unused

    ; filter_6
    psllw               m4, m2, 2
    paddw               m5, m1, m1
    paddw               m4, m7
    psubw               m5, m2
    paddw               m4, m0
    paddw               m5, reg_Q0
    paddw               m4, [pw_4]
    paddw               m5, m4

    ; keep the unfiltered p2/p1 around; they are subtracted from the running
    ; sum again in the q0/q1 steps below
%if ARCH_X86_64
    mova                m8, m1
    mova                m9, m7
%else
    mova    [rsp+0*mmsize], m1
    mova    [rsp+1*mmsize], m7
%endif
%ifidn %1, v
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1
%else
    FILTER_STEP         m4, m5, F8M, 3, %%p2, m1,     m2,             m7,     reg_Q1, 1
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%p1, m7,     m2,             m0,     m3, 1
    FILTER_STEP         m4, m5, F8M, 3, %%p0, m0,     m2,             reg_Q0, m6, 1
%if ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, m8,             reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, m9,             m3,     m6, ARCH_X86_64
%else
    FILTER_STEP         m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP         m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3,     m6, ARCH_X86_64
%endif
    FILTER_STEP         m4, m5, F8M, 3, %%q2, m3

    UNSCRATCH            2, 10, %%q0
    UNSCRATCH            6, 11, %%q1
%else
    SWAP                 1, 7
    mova                m2, [%%q0]
    mova                m6, [%%q1]
%endif
    UNSCRATCH            3, 13, rsp+(%%off+4)*mmsize,  HEV

    ; m0=p0
    ; m1=p2
    ; m2=q0
    ; m3=hev_mask
    ; m4-5=free
    ; m6=q1
    ; m7=p1

    ; filter_4
    psubw               m4, m7, m6                  ; p1-q1
    psubw               m5, m2, m0                  ; q0-p0
    pand                m4, m3
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]       ; clip_intp2(p1-q1, 9) -> f
    paddw               m4, m5
    paddw               m5, m5
    paddw               m4, m5                      ; 3*(q0-p0)+f
    pminsw              m4, [pw_ %+ %%maxsgn]
    pmaxsw              m4, [pw_ %+ %%minsgn]       ; clip_intp2(3*(q0-p0)+f, 9) -> f
    pand                m4, reg_F4M
    paddw               m5, m4, [pw_4]
    paddw               m4, [pw_3]
    pminsw              m5, [pw_ %+ %%maxsgn]
    pminsw              m4, [pw_ %+ %%maxsgn]
    psraw               m5, 3                       ; min_intp2(f+4, 9)>>3 -> f1
    psraw               m4, 3                       ; min_intp2(f+3, 9)>>3 -> f2
    psubw               m2, m5                      ; q0-f1
    paddw               m0, m4                      ; p0+f2
    pandn               m3, m5                      ; f1 & !hev (for p1/q1 adj)
    pxor                m4, m4
    mova                m5, [pw_ %+ %%maxusgn]
    pmaxsw              m2, m4
    pmaxsw              m0, m4
    pminsw              m2, m5
    pminsw              m0, m5
%if cpuflag(ssse3)
    pmulhrsw            m3, [pw_16384]              ; (f1+1)>>1
%else
    paddw               m3, [pw_1]
    psraw               m3, 1
%endif
    paddw               m7, m3                      ; p1+f
    psubw               m6, m3                      ; q1-f
    pmaxsw              m7, m4
    pmaxsw              m6, m4
    pminsw              m7, m5
    pminsw              m6, m5

    ; store
%ifidn %1, v
    mova            [%%p1], m7
    mova            [%%p0], m0
    mova            [%%q0], m2
    mova            [%%q1], m6
%else ; %1 == h
%if %2 == 4
    TRANSPOSE4x4W        7, 0, 2, 6, 1
    movh   [dst0q+strideq*0-4], m7
    movhps [dst0q+strideq*1-4], m7
    movh   [dst0q+strideq*2-4], m0
    movhps [dst0q+stride3q -4], m0
    movh   [dst4q+strideq*0-4], m2
    movhps [dst4q+strideq*1-4], m2
    movh   [dst4q+strideq*2-4], m6
    movhps [dst4q+stride3q -4], m6
%elif %2 == 8
    mova                m3, [%%p3]
    mova                m4, [%%q2]
    mova                m5, [%%q3]

%if ARCH_X86_64
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, 8
%else
    TRANSPOSE8x8W        3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    mova                m2, [%%q0]
%endif

    movu [dst0q+strideq*0-8], m3
    movu [dst0q+strideq*1-8], m1
    movu [dst0q+strideq*2-8], m7
    movu [dst0q+stride3q -8], m0
    movu [dst4q+strideq*0-8], m2
    movu [dst4q+strideq*1-8], m6
    movu [dst4q+strideq*2-8], m4
    movu [dst4q+stride3q -8], m5
%else ; %2 == 16
    SCRATCH              2,  8, %%q0
    SCRATCH              6,  9, %%q1
    mova                m2, [%%p7]
    mova                m3, [%%p6]
    mova                m4, [%%p5]
    mova                m5, [%%p4]
    mova                m6, [%%p3]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, 10
%else
    mova            [%%p1], m7
    TRANSPOSE8x8W        2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
%endif

    mova [dst0q+strideq*0-16], m2
    mova [dst0q+strideq*1-16], m3
    mova [dst0q+strideq*2-16], m4
    mova [dst0q+stride3q -16], m5
%if ARCH_X86_64
    mova [dst4q+strideq*0-16], m6
%endif
    mova [dst4q+strideq*1-16], m1
    mova [dst4q+strideq*2-16], m7
    mova [dst4q+stride3q -16], m0

    UNSCRATCH            2,  8, %%q0
    UNSCRATCH            6,  9, %%q1
    mova                m0, [%%q2]
    mova                m1, [%%q3]
    mova                m3, [%%q4]
    mova                m4, [%%q5]
%if ARCH_X86_64
    mova                m5, [%%q6]
%endif
    mova                m7, [%%q7]

%if ARCH_X86_64
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W        2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif

    mova [dst0q+strideq*0], m2
    mova [dst0q+strideq*1], m6
    mova [dst0q+strideq*2], m0
    mova [dst0q+stride3q ], m1
%if ARCH_X86_64
    mova [dst4q+strideq*0], m3
%endif
    mova [dst4q+strideq*1], m4
    mova [dst4q+strideq*2], m5
    mova [dst4q+stride3q ], m7
%endif ; %2
%endif ; %1
    RET
%endmacro
; LOOP_FILTER_CPUSETS dir, wd, bpp
;
; Instantiate LOOP_FILTER with the given arguments once for each supported
; instruction set: sse2, ssse3 and avx, in that order.
%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1 , %2 , %3
INIT_XMM ssse3
LOOP_FILTER %1 , %2 , %3
INIT_XMM avx
LOOP_FILTER %1 , %2 , %3
%endmacro
; LOOP_FILTER_WDSETS dir, bpp
;
; Instantiate LOOP_FILTER_CPUSETS for every filter width (4, 8 and 16) with
; the given direction and bit depth.
%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1 , 4 , %2
LOOP_FILTER_CPUSETS %1 , 8 , %2
LOOP_FILTER_CPUSETS %1 , 16 , %2
%endmacro
; Emit all loop filter functions: both directions (h, v) at 10 and 12 bits
; per pixel, each expanded over every width and instruction set.
LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12
; Measurement V0.5 in percent C=93 H=88 G=90
; Processing time: 0.13 seconds
; (preprocessed on 2026-06-05)
; (c) Formatika GbR, Germany