/*
* VP8 NEON optimisations
*
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
* Copyright (c) 2019 Martin Storsjo <martin@martin.st>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
// Inverse Walsh-Hadamard transform of a 4x4 block of 16-bit coefficients.
//   x0: output pointer; one 16-bit result is stored every 32 bytes (16 stores)
//   x1: the 16 input coefficients; overwritten with zeros
// Scratch: x3, v0-v7, v16, v30.
function ff_vp8_luma_dc_wht_neon, export=1
        ld1             {v0.4h - v3.4h}, [x1]
        movi            v30.8h,  #0

        // First butterfly pass; the two stores clearing the input are
        // interleaved with the arithmetic.
        add             v4.4h,   v0.4h,   v3.4h
        add             v6.4h,   v1.4h,   v2.4h
        st1             {v30.8h}, [x1], #16
        sub             v7.4h,   v1.4h,   v2.4h
        sub             v5.4h,   v0.4h,   v3.4h
        st1             {v30.8h}, [x1]
        add             v0.4h,   v4.4h,   v6.4h
        add             v1.4h,   v5.4h,   v7.4h
        sub             v2.4h,   v4.4h,   v6.4h
        sub             v3.4h,   v5.4h,   v7.4h

        movi            v16.4h,  #3

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // v0 enters every output of the second pass with a positive sign, so
        // biasing it by 3 makes the truncating shift below compute (x + 3) >> 3.
        add             v0.4h,   v0.4h,   v16.4h

        // Second butterfly pass.
        add             v4.4h,   v0.4h,   v3.4h
        add             v6.4h,   v1.4h,   v2.4h
        sub             v7.4h,   v1.4h,   v2.4h
        sub             v5.4h,   v0.4h,   v3.4h
        add             v0.4h,   v4.4h,   v6.4h
        add             v1.4h,   v5.4h,   v7.4h
        sub             v2.4h,   v4.4h,   v6.4h
        sub             v3.4h,   v5.4h,   v7.4h

        sshr            v0.4h,   v0.4h,   #3
        sshr            v1.4h,   v1.4h,   #3
        sshr            v2.4h,   v2.4h,   #3
        sshr            v3.4h,   v3.4h,   #3

        // Scatter the 16 results: lane by lane, v0..v3 within each lane,
        // advancing the output pointer by 32 bytes after every store.
        mov             x3,  #32
.irp lane, 0, 1, 2, 3
        st1             {v0.h}[\lane], [x0], x3
        st1             {v1.h}[\lane], [x0], x3
        st1             {v2.h}[\lane], [x0], x3
        st1             {v3.h}[\lane], [x0], x3
.endr

        ret
endfunc
// 4x4 inverse DCT of the coefficients at x1, added to the pixels at x0.
//   x0: destination, 4 rows of 4 bytes
//   x1: 16 coefficients (16-bit); overwritten with zeros
//   x2: destination stride in bytes
// Scratch: w4, v0-v7, v16-v27, v29.
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b}, [x1]

        // v4.h[0] = 20091, v4.h[1] = 35468/2.
        //   smull + shrn #16 + add x  ->  x + ((x * 20091) >> 16)
        //   sqdmulh (doubling)        ->  (x * 35468) >> 16
        mov             w4,  #20091
        movk            w4,  #35468/2, lsl #16
        dup             v4.2s,   w4

        // First 1-D pass.
        smull           v26.4s,  v1.4h,   v4.h[0]
        smull           v27.4s,  v3.4h,   v4.h[0]
        sqdmulh         v20.4h,  v1.4h,   v4.h[1]
        sqdmulh         v23.4h,  v3.4h,   v4.h[1]
        shrn            v21.4h,  v26.4s,  #16
        shrn            v22.4h,  v27.4s,  #16
        add             v21.4h,  v21.4h,  v1.4h
        add             v22.4h,  v22.4h,  v3.4h

        add             v16.4h,  v0.4h,   v2.4h
        sub             v17.4h,  v0.4h,   v2.4h

        add             v18.4h,  v21.4h,  v23.4h
        sub             v19.4h,  v20.4h,  v22.4h

        add             v0.4h,   v16.4h,  v18.4h
        add             v1.4h,   v17.4h,  v19.4h
        sub             v3.4h,   v16.4h,  v18.4h
        sub             v2.4h,   v17.4h,  v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        // Second 1-D pass, interleaved with zeroing the coefficient block and
        // loading the four destination rows into v24-v27.
        movi            v29.8h,  #0
        smull           v26.4s,  v1.4h,   v4.h[0]
        st1             {v29.8h},  [x1], #16
        smull           v27.4s,  v3.4h,   v4.h[0]
        st1             {v29.16b}, [x1]
        sqdmulh         v21.4h,  v1.4h,   v4.h[1]
        sqdmulh         v23.4h,  v3.4h,   v4.h[1]
        shrn            v20.4h,  v26.4s,  #16
        shrn            v22.4h,  v27.4s,  #16
        add             v20.4h,  v20.4h,  v1.4h
        add             v22.4h,  v22.4h,  v3.4h
        add             v16.4h,  v0.4h,   v2.4h
        sub             v17.4h,  v0.4h,   v2.4h

        add             v18.4h,  v20.4h,  v23.4h
        ld1             {v24.s}[0], [x0], x2
        sub             v19.4h,  v21.4h,  v22.4h
        ld1             {v25.s}[0], [x0], x2
        add             v0.4h,   v16.4h,  v18.4h
        add             v1.4h,   v17.4h,  v19.4h
        ld1             {v26.s}[0], [x0], x2
        sub             v3.4h,   v16.4h,  v18.4h
        sub             v2.4h,   v17.4h,  v19.4h
        ld1             {v27.s}[0], [x0], x2

        // Rounding shift: (x + 4) >> 3.
        srshr           v0.4h,   v0.4h,   #3
        srshr           v1.4h,   v1.4h,   #3
        srshr           v2.4h,   v2.4h,   #3
        srshr           v3.4h,   v3.4h,   #3

        sub             x0,  x0,  x2,  lsl #2   // rewind dst by the 4 rows just read

        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16

        // Add the residual to the pixels and saturate to 0..255.
        uaddw           v0.8h,   v0.8h,   v24.8b
        uaddw           v1.8h,   v1.8h,   v25.8b
        uaddw           v2.8h,   v2.8h,   v26.8b
        uaddw           v3.8h,   v3.8h,   v27.8b
        sqxtun          v0.8b,   v0.8h
        sqxtun          v1.8b,   v1.8h
        sqxtun          v2.8b,   v2.8h
        sqxtun          v3.8b,   v3.8h

        st1             {v0.s}[0], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v2.s}[0], [x0], x2
        st1             {v3.s}[0], [x0], x2

        ret
endfunc
// DC-only add for four 4x4 blocks arranged 2x2 inside an 8x8 pixel area.
//   x0: destination, 8 rows of 8 bytes
//   x1: four DC values (16-bit), 32 bytes apart; each is zeroed after reading
//   x2: destination stride in bytes
// Rows 0-3 get dc0 (left 4 pixels) / dc1 (right 4 pixels), rows 4-7 get
// dc2 / dc3.
// Scratch: x3, v0-v7, v16-v27.
function ff_vp8_idct_dc_add4uv_neon, export=1
        movi            v0.4h,   #0
        mov             x3,  #32                // distance between DC values

        ld1r            {v16.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v18.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h},  [x1]
        st1             {v0.h}[0], [x1], x3

        ins             v16.d[1],  v17.d[0]     // v16 = dc0 x4 | dc1 x4
        ins             v18.d[1],  v19.d[0]     // v18 = dc2 x4 | dc3 x4

        mov             x3,  x0                 // x3 = store pointer, x0 = load pointer

        srshr           v16.8h,  v16.8h,  #3    // dc = (dc + 4) >> 3
        ld1             {v0.8b},   [x0], x2
        srshr           v18.8h,  v18.8h,  #3
        ld1             {v1.8b},   [x0], x2
        uaddw           v20.8h,  v16.8h,  v0.8b
        ld1             {v2.8b},   [x0], x2
        uaddw           v0.8h,   v16.8h,  v1.8b
        ld1             {v3.8b},   [x0], x2
        uaddw           v22.8h,  v16.8h,  v2.8b
        ld1             {v4.8b},   [x0], x2
        uaddw           v2.8h,   v16.8h,  v3.8b
        ld1             {v5.8b},   [x0], x2
        uaddw           v24.8h,  v18.8h,  v4.8b
        ld1             {v6.8b},   [x0], x2
        uaddw           v4.8h,   v18.8h,  v5.8b
        ld1             {v7.8b},   [x0], x2
        uaddw           v26.8h,  v18.8h,  v6.8b

        // Saturate to 0..255 and write back, interleaved.
        sqxtun          v20.8b,  v20.8h
        uaddw           v6.8h,   v18.8h,  v7.8b
        sqxtun          v21.8b,  v0.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.8b},  [x3], x2
        sqxtun          v23.8b,  v2.8h
        st1             {v21.8b},  [x3], x2
        sqxtun          v24.8b,  v24.8h
        st1             {v22.8b},  [x3], x2
        sqxtun          v25.8b,  v4.8h
        st1             {v23.8b},  [x3], x2
        sqxtun          v26.8b,  v26.8h
        st1             {v24.8b},  [x3], x2
        sqxtun          v27.8b,  v6.8h
        st1             {v25.8b},  [x3], x2
        st1             {v26.8b},  [x3], x2
        st1             {v27.8b},  [x3], x2

        ret
endfunc
// DC-only add for four 4x4 blocks placed side by side (16 pixels x 4 rows).
//   x0: destination, 4 rows of 16 bytes
//   x1: four DC values (16-bit), 32 bytes apart; each is zeroed after reading
//   x2: destination stride in bytes
// Pixels 0-3 get dc0, 4-7 dc1, 8-11 dc2, 12-15 dc3.
// Scratch: x3, v0-v3, v16-v23.
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b,  #0
        mov             x3,  #32                // distance between DC values

        ld1r            {v16.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v16.2d,  v16.2d,  v17.2d    // v16 = dc0 x4 | dc1 x4
        ld1r            {v18.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v18.2d,  v18.2d,  v19.2d    // v18 = dc2 x4 | dc3 x4

        srshr           v16.8h,  v16.8h,  #3        // dc = (dc + 4) >> 3
        ld1             {v0.16b},  [x0], x2
        srshr           v18.8h,  v18.8h,  #3
        ld1             {v1.16b},  [x0], x2

        // Low 8 pixels of each row use v16, high 8 pixels use v18.
        uaddw           v20.8h,  v16.8h,  v0.8b
        ld1             {v2.16b},  [x0], x2
        uaddw2          v0.8h,   v18.8h,  v0.16b
        ld1             {v3.16b},  [x0], x2
        uaddw           v21.8h,  v16.8h,  v1.8b
        uaddw2          v1.8h,   v18.8h,  v1.16b
        uaddw           v22.8h,  v16.8h,  v2.8b
        uaddw2          v2.8h,   v18.8h,  v2.16b
        uaddw           v23.8h,  v16.8h,  v3.8b
        uaddw2          v3.8h,   v18.8h,  v3.16b

        sub             x0,  x0,  x2,  lsl #2       // rewind dst by 4 rows

        // Saturate to 0..255 and write back, interleaved.
        sqxtun          v20.8b,  v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b}, [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b}, [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b}, [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b}, [x0], x2

        ret
endfunc
// DC-only add for a single 4x4 block.
//   x0: destination, 4 rows of 4 bytes
//   x1: DC value (16-bit); zeroed after reading
//   x2: destination stride in bytes
// Scratch: w3, v0-v4.
function ff_vp8_idct_dc_add_neon, export=1
        mov             w3,  #0
        ld1r            {v2.8h},   [x1]
        strh            w3,  [x1]               // clear the coefficient
        srshr           v2.8h,   v2.8h,   #3    // dc = (dc + 4) >> 3

        // Rows 0/1 are packed into v0, rows 2/3 into v1.
        ld1             {v0.s}[0], [x0], x2
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h,   v2.8h,   v0.8b
        ld1             {v1.s}[0], [x0], x2
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h,   v2.8h,   v1.8b

        sqxtun          v0.8b,   v3.8h          // saturate to 0..255
        sqxtun          v1.8b,   v4.8h

        sub             x0,  x0,  x2,  lsl #2   // rewind dst by 4 rows
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2

        ret
endfunc
// Shared loop-filter core; filters 16 pixel positions at once.
//
// Register layout on entry:
//   P3..Q3     -> v0..v7   (simple=1 only reads P1..Q1 = v2..v5)
//   flim_E     -> v22      (the single limit when simple=1)
//   flim_I     -> v23      (not read when simple=1)
//   hev_thresh -> the general-purpose register passed as \hev_thresh
//                 (not read when simple=1)
//
// Results are left in place: v3/v4 (P0/Q0) always, v2/v5 (P1/Q1) when
// inner=1 or in the default variant, v1/v6 (P2/Q2) only in the default
// variant.  v0 and v7 are never written.  v16-v23 are scratch.
//
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1); flim_I is no longer needed
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh          // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev (not computed, and not used below, when simple=1)
        //   v21: 0x80 in every byte

        // convert to signed value:
        eor             v3.16b,  v3.16b,  v21.16b     // PS0 = P0 ^ 0x80
        eor             v4.16b,  v4.16b,  v21.16b     // QS0 = Q0 ^ 0x80

        movi            v20.8h,  #3
        ssubl           v18.8h,  v4.8b,   v3.8b       // QS0 - PS0
        ssubl2          v19.8h,  v4.16b,  v3.16b      //   (widened to 16 bit)
        eor             v2.16b,  v2.16b,  v21.16b     // PS1 = P1 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b     // QS1 = Q1 ^ 0x80
        mul             v18.8h,  v18.8h,  v20.8h      // w = 3 * (QS0 - PS0)
        mul             v19.8h,  v19.8h,  v20.8h

        sqsub           v20.16b, v2.16b,  v5.16b      // clamp(PS1-QS1)
        movi            v22.16b, #4
        movi            v23.16b, #3
    .if \inner
        and             v20.16b, v20.16b, v17.16b     // if(hev) w += clamp(PS1-QS1)
    .endif
        saddw           v18.8h,  v18.8h,  v20.8b      // w += clamp(PS1-QS1)
        saddw2          v19.8h,  v19.8h,  v20.16b
        sqxtn           v18.8b,  v18.8h               // narrow result back into v18
        sqxtn2          v18.16b, v19.8h
    .if !\inner && !\simple
        eor             v1.16b,  v1.16b,  v21.16b     // PS2 = P2 ^ 0x80
        eor             v6.16b,  v6.16b,  v21.16b     // QS2 = Q2 ^ 0x80
    .endif
        and             v18.16b, v18.16b, v16.16b     // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v20 -> free for reuse
        //
        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);

    .if \simple
        sqadd           v19.16b, v18.16b, v22.16b     // c1 = clamp(w+4)
        sqadd           v20.16b, v18.16b, v23.16b     // c2 = clamp(w+3)
        sshr            v19.16b, v19.16b, #3          // c1 >>= 3
        sshr            v20.16b, v20.16b, #3          // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b     // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b     // PS0 = clamp(PS0+c2)
        eor             v4.16b,  v4.16b,  v21.16b     // Q0 = QS0 ^ 0x80
        eor             v3.16b,  v3.16b,  v21.16b     // P0 = PS0 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b     // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b     // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd           v19.16b, v18.16b, v22.16b     // c1 = clamp(w+4)
        sqadd           v20.16b, v18.16b, v23.16b     // c2 = clamp(w+3)
        sshr            v19.16b, v19.16b, #3          // c1 >>= 3
        sshr            v20.16b, v20.16b, #3          // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b     // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b     // PS0 = clamp(PS0+c2)
        bic             v19.16b, v19.16b, v17.16b     // c1 & ~hev
        eor             v4.16b,  v4.16b,  v21.16b     // Q0 = QS0 ^ 0x80
        srshr           v19.16b, v19.16b, #1          // c3 = ((c1&~hev) + 1) >> 1
        eor             v3.16b,  v3.16b,  v21.16b     // P0 = PS0 ^ 0x80
        sqsub           v5.16b,  v5.16b,  v19.16b     // QS1 = clamp(QS1-c3)
        sqadd           v2.16b,  v2.16b,  v19.16b     // PS1 = clamp(PS1+c3)
        eor             v5.16b,  v5.16b,  v21.16b     // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b     // P1 = PS1 ^ 0x80
    .else
        and             v20.16b, v18.16b, v17.16b     // w & hev
        sqadd           v19.16b, v20.16b, v22.16b     // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v20.16b, v23.16b     // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3          // c1 >>= 3
        sshr            v20.16b, v20.16b, #3          // c2 >>= 3
        bic             v18.16b, v18.16b, v17.16b     // w &= ~hev
        sqsub           v4.16b,  v4.16b,  v19.16b     // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b     // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi            v17.8h,  #63
        sshll           v22.8h,  v18.8b,  #3          // 8*w
        sshll2          v23.8h,  v18.16b, #3
        saddw           v22.8h,  v22.8h,  v18.8b      // 9*w
        saddw2          v23.8h,  v23.8h,  v18.16b
        add             v16.8h,  v17.8h,  v22.8h
        add             v17.8h,  v17.8h,  v23.8h      //  9*w + 63
        add             v19.8h,  v16.8h,  v22.8h
        add             v20.8h,  v17.8h,  v23.8h      // 18*w + 63
        add             v22.8h,  v19.8h,  v22.8h
        add             v23.8h,  v20.8h,  v23.8h      // 27*w + 63
        sqshrn          v16.8b,  v16.8h,  #7
        sqshrn2         v16.16b, v17.8h,  #7          // clamp(( 9*w + 63)>>7)
        sqshrn          v19.8b,  v19.8h,  #7
        sqshrn2         v19.16b, v20.8h,  #7          // clamp((18*w + 63)>>7)
        sqshrn          v22.8b,  v22.8h,  #7
        sqshrn2         v22.16b, v23.8h,  #7          // clamp((27*w + 63)>>7)
        sqadd           v1.16b,  v1.16b,  v16.16b     // PS2 = clamp(PS2+a)
        sqsub           v6.16b,  v6.16b,  v16.16b     // QS2 = clamp(QS2-a)
        sqadd           v2.16b,  v2.16b,  v19.16b     // PS1 = clamp(PS1+a)
        sqsub           v5.16b,  v5.16b,  v19.16b     // QS1 = clamp(QS1-a)
        sqadd           v3.16b,  v3.16b,  v22.16b     // PS0 = clamp(PS0+a)
        sqsub           v4.16b,  v4.16b,  v22.16b     // QS0 = clamp(QS0-a)
        eor             v3.16b,  v3.16b,  v21.16b     // P0 = PS0 ^ 0x80
        eor             v4.16b,  v4.16b,  v21.16b     // Q0 = QS0 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b     // P1 = PS1 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b     // Q1 = QS1 ^ 0x80
        eor             v1.16b,  v1.16b,  v21.16b     // P2 = PS2 ^ 0x80
        eor             v6.16b,  v6.16b,  v21.16b     // Q2 = QS2 ^ 0x80
    .endif
.endm
// Generates ff_vp8_v_loop_filter16<name>_neon: filters across a horizontal
// edge, 16 pixels wide.
//   x0: pointer to the Q0 row (first row below the edge)
//   x1: stride in bytes
//   w2: flim_E; w3: flim_I and w4: hev_thresh (w3/w4 unused when simple=1)
// With simple=1 only P1..Q1 are loaded and stored; otherwise P3..Q3 are
// loaded and P2..Q2 are stored.
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        // step up to the first row needed: 2 rows (simple) or 4 rows
        sub             x0,  x0,  x1,  lsl #1+!\simple

        // Load pixels:
    .if !\simple
        ld1             {v0.16b}, [x0], x1      // P3
        ld1             {v1.16b}, [x0], x1      // P2
    .endif
        ld1             {v2.16b}, [x0], x1      // P1
        ld1             {v3.16b}, [x0], x1      // P0
        ld1             {v4.16b}, [x0], x1      // Q0
        ld1             {v5.16b}, [x0], x1      // Q1
    .if !\simple
        ld1             {v6.16b}, [x0], x1      // Q2
        ld1             {v7.16b}, [x0]          // Q3
        dup             v23.16b, w3             // flim_I
    .endif
        dup             v22.16b, w2             // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to the first row to store:
        //   simple:    dst -= stride * 4  (P1)
        //   otherwise: dst -= stride * 6  (P2)
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        // Store pixels:
        st1             {v1.16b}, [x0], x1      // P2
    .endif
        st1             {v2.16b}, [x0], x1      // P1
        st1             {v3.16b}, [x0], x1      // P0
        st1             {v4.16b}, [x0], x1      // Q0
        st1             {v5.16b}, [x0], x1      // Q1
    .if !\simple
        st1             {v6.16b}, [x0]          // Q2
    .endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
// Generates ff_vp8_v_loop_filter8uv<name>_neon: filters across a horizontal
// edge in two 8-pixel-wide planes at once.
//   x0, x1: pointers to the Q0 row of the first / second plane (u / v)
//   x2: stride in bytes (shared by both planes)
//   w3: flim_E, w4: flim_I, w5: hev_thresh
// The first plane occupies the low 8 bytes of each vector, the second plane
// the high 8 bytes.
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2   // up 4 rows to P3
        sub             x1,  x1,  x2,  lsl #2

        // Load pixels:
        ld1             {v0.d}[0], [x0], x2     // P3
        ld1             {v0.d}[1], [x1], x2     // P3
        ld1             {v1.d}[0], [x0], x2     // P2
        ld1             {v1.d}[1], [x1], x2     // P2
        ld1             {v2.d}[0], [x0], x2     // P1
        ld1             {v2.d}[1], [x1], x2     // P1
        ld1             {v3.d}[0], [x0], x2     // P0
        ld1             {v3.d}[1], [x1], x2     // P0
        ld1             {v4.d}[0], [x0], x2     // Q0
        ld1             {v4.d}[1], [x1], x2     // Q0
        ld1             {v5.d}[0], [x0], x2     // Q1
        ld1             {v5.d}[1], [x1], x2     // Q1
        ld1             {v6.d}[0], [x0], x2     // Q2
        ld1             {v6.d}[1], [x1], x2     // Q2
        ld1             {v7.d}[0], [x0]         // Q3
        ld1             {v7.d}[1], [x1]         // Q3

        dup             v22.16b, w3             // flim_E
        dup             v23.16b, w4             // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        sub             x0,  x0,  x2,  lsl #1
        sub             x1,  x1,  x2,  lsl #1

        // Store pixels:
        st1             {v1.d}[0], [x0], x2     // P2
        st1             {v1.d}[1], [x1], x2     // P2
        st1             {v2.d}[0], [x0], x2     // P1
        st1             {v2.d}[1], [x1], x2     // P1
        st1             {v3.d}[0], [x0], x2     // P0
        st1             {v3.d}[1], [x1], x2     // P0
        st1             {v4.d}[0], [x0], x2     // Q0
        st1             {v4.d}[1], [x1], x2     // Q0
        st1             {v5.d}[0], [x0], x2     // Q1
        st1             {v5.d}[1], [x1], x2     // Q1
        st1             {v6.d}[0], [x0]         // Q2
        st1             {v6.d}[1], [x1]         // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
// Generates ff_vp8_h_loop_filter16<name>_neon: filters across a vertical
// edge, 16 rows tall.
//   x0: pointer to the first pixel right of the edge in the top row
//   x1: stride in bytes
//   w2: flim_E; w3: flim_I and w4: hev_thresh (w3/w4 unused when simple=1)
// Eight bytes per row (4 on each side of the edge) are loaded, transposed so
// that v0..v7 hold P3..Q3, filtered, transposed back and all rewritten.
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        sub             x0,  x0,  #4            // 4 pixels left of the edge

        // Load pixels: rows 0-7 into the low halves, rows 8-15 into the high
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2             // flim_E
    .if !\simple
        dup             v23.16b, w3             // flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1,  lsl #4   // backup 16 rows

        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
// Generates ff_vp8_h_loop_filter8uv<name>_neon: filters across a vertical
// edge in two planes at once, 8 rows each.
//   x0, x1: pointers to the first pixel right of the edge in the top row of
//           the first / second plane (u / v)
//   x2: stride in bytes (shared by both planes)
//   w3: flim_E, w4: flim_I, w5: hev_thresh
// Row n of the first plane goes to the low half of vn, row n of the second
// plane to the high half; the transpose turns v0..v7 into P3..Q3.
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  #4            // 4 pixels left of the edge
        sub             x1,  x1,  #4

        // Load pixels:
        ld1             {v0.d}[0], [x0], x2     // load u
        ld1             {v0.d}[1], [x1], x2     // load v
        ld1             {v1.d}[0], [x0], x2
        ld1             {v1.d}[1], [x1], x2
        ld1             {v2.d}[0], [x0], x2
        ld1             {v2.d}[1], [x1], x2
        ld1             {v3.d}[0], [x0], x2
        ld1             {v3.d}[1], [x1], x2
        ld1             {v4.d}[0], [x0], x2
        ld1             {v4.d}[1], [x1], x2
        ld1             {v5.d}[0], [x0], x2
        ld1             {v5.d}[1], [x1], x2
        ld1             {v6.d}[0], [x0], x2
        ld1             {v6.d}[1], [x1], x2
        ld1             {v7.d}[0], [x0], x2
        ld1             {v7.d}[1], [x1], x2

        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3             // flim_E
        dup             v23.16b, w4             // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2,  lsl #3   // backup u 8 rows
        sub             x1,  x1,  x2,  lsl #3   // backup v 8 rows

        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x2     // store u
        st1             {v0.d}[1], [x1], x2     // store v
        st1             {v1.d}[0], [x0], x2
        st1             {v1.d}[1], [x1], x2
        st1             {v2.d}[0], [x0], x2
        st1             {v2.d}[1], [x1], x2
        st1             {v3.d}[0], [x0], x2
        st1             {v3.d}[1], [x1], x2
        st1             {v4.d}[0], [x0], x2
        st1             {v4.d}[1], [x1], x2
        st1             {v5.d}[0], [x0], x2
        st1             {v5.d}[1], [x1], x2
        st1             {v6.d}[0], [x0], x2
        st1             {v6.d}[1], [x1], x2
        st1             {v7.d}[0], [x0]
        st1             {v7.d}[1], [x1]

        ret
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
// Copies a 16-byte-wide block, four rows per loop iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride, w4: row count
// The loop runs while the remaining count is > 0 after subtracting 4, so at
// least four rows are always copied.
function ff_put_vp8_pixels16_neon, export=1
1:
        subs            w4,  w4,  #4
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x2], x3
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        b.gt            1b
        ret
endfunc
// Copies an 8-byte-wide block, four rows per loop iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride, w4: row count
// Two rows share one vector register (low / high 64 bits).
function ff_put_vp8_pixels8_neon, export=1
1:
        subs            w4,  w4,  #4
        ld1             {v0.8b},   [x2], x3
        ld1             {v0.d}[1], [x2], x3
        ld1             {v1.8b},   [x2], x3
        ld1             {v1.d}[1], [x2], x3
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc
/* 4/6-tap 8th-pel MC */
// 6-tap horizontal filter producing 8 pixels.
//   \d:        destination register (8 bytes written)
//   \s0, \s1:  16 consecutive source bytes (low halves of the two registers)
//   v0.h[0-5]: filter taps; taps 1 and 4 are subtracted (mls), the others added
// Output pixel i = sat8((sum over k of tap[k] * src[i + k] + 64) >> 7).
// Scratch: v18, v19, v21-v26.
.macro  vp8_epel8_h6    d,   s0,   s1
        ext             v22.8b,  \s0\().8b,  \s1\().8b,  #1
        uxtl            v18.8h,  \s0\().8b                  // src[i + 0]
        ext             v23.8b,  \s0\().8b,  \s1\().8b,  #2
        uxtl            v19.8h,  v22.8b                     // src[i + 1]
        ext             v24.8b,  \s0\().8b,  \s1\().8b,  #3
        uxtl            v21.8h,  v23.8b                     // src[i + 2]
        ext             v25.8b,  \s0\().8b,  \s1\().8b,  #4
        uxtl            v22.8h,  v24.8b                     // src[i + 3]
        ext             v26.8b,  \s0\().8b,  \s1\().8b,  #5
        uxtl            v25.8h,  v25.8b                     // src[i + 4]
        mul             v21.8h,  v21.8h,  v0.h[2]
        uxtl            v26.8h,  v26.8b                     // src[i + 5]
        mul             v22.8h,  v22.8h,  v0.h[3]
        mls             v21.8h,  v19.8h,  v0.h[1]
        mls             v22.8h,  v25.8h,  v0.h[4]
        mla             v21.8h,  v18.8h,  v0.h[0]
        mla             v22.8h,  v26.8h,  v0.h[5]
        sqadd           v22.8h,  v21.8h,  v22.8h            // join the two partial sums
        sqrshrun        \d\().8b, v22.8h, #7
.endm
// 6-tap horizontal filter producing 16 pixels.
//   \d0:       destination register (16 bytes written)
//   \v0, \v1:  32 consecutive source bytes
//   v0.h[0-5]: filter taps; taps 1 and 4 are subtracted (mls), the others added
// Scratch: v1, v2, v3, v16-v23.  Note that v1-v3 are hard-coded scratch
// registers: \v1 is last read by the "ext ... #5" that overwrites v2, and \v0
// is last read by the uxtl/uxtl2 pair that overwrites v1, which is what makes
// the invocation "v1, v1, v2" used in this file safe.
.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,  v22.8b                     // src[i + 3], low half
        uxtl2           v22.8h,  v22.16b                    // src[i + 3], high half
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,  v23.8b                     // src[i + 4]
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,  v3.8b                      // src[i + 2]
        uxtl2           v3.8h,   v3.16b
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,  v2.8b                      // src[i + 5]
        uxtl2           v2.8h,   v2.16b
        uxtl            v17.8h,  v16.8b                     // src[i + 1]
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h,  v0.h[3]
        mul             v18.8h,  v18.8h,  v0.h[2]
        mul             v3.8h,   v3.8h,   v0.h[2]
        mul             v22.8h,  v22.8h,  v0.h[3]
        mls             v19.8h,  v20.8h,  v0.h[4]
        uxtl            v20.8h,  \v0\().8b                  // src[i + 0]
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h,  v0.h[1]
        mls             v3.8h,   v16.8h,  v0.h[1]
        mls             v22.8h,  v23.8h,  v0.h[4]
        mla             v18.8h,  v20.8h,  v0.h[0]
        mla             v19.8h,  v21.8h,  v0.h[5]
        mla             v3.8h,   v1.8h,   v0.h[0]
        mla             v22.8h,  v2.8h,   v0.h[5]
        sqadd           v19.8h,  v18.8h,  v19.8h            // low 8 pixels
        sqadd           v22.8h,  v3.8h,   v22.8h            // high 8 pixels
        sqrshrun        \d0\().8b,  v19.8h, #7
        sqrshrun2       \d0\().16b, v22.8h, #7
.endm
// 6-tap vertical filter producing two output rows of 8 pixels.
//   \d0, \d1:  destination registers for output row 0 and row 1
//   \s0..\s6:  seven consecutive input rows (8 bytes each); all seven are
//              widened in place and clobbered
//   v0.h[0-5]: filter taps; taps 1 and 4 are subtracted (mls), the others added
// Row 0 is filtered from \s0..\s5, row 1 from \s1..\s6.
// Scratch: v31.
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b

        // Four accumulators: \s0 and v31 form row 0, \s3 and \s6 form row 1.
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h,    \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h,    \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h,    \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]

        sqadd           v31.8h,    \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm
// 4-tap horizontal filter producing 8 pixels.
//   \d:        destination register (8 bytes written)
//   \v0, \v1:  16 consecutive source bytes (low halves of the two registers)
//   v0.h[1-4]: filter taps; taps 1 and 4 are subtracted (mls), 2 and 3 added
// Output pixel i uses src[i + 0] .. src[i + 3].
// Scratch: v19, v20, v22, v23, v25.
.macro  vp8_epel8_h4    d,   v0,   v1
        ext             v22.8b,  \v0\().8b,  \v1\().8b,  #1
        uxtl            v19.8h,  \v0\().8b                  // src[i + 0]
        ext             v23.8b,  \v0\().8b,  \v1\().8b,  #2
        uxtl            v20.8h,  v22.8b                     // src[i + 1]
        ext             v25.8b,  \v0\().8b,  \v1\().8b,  #3
        uxtl            v22.8h,  v23.8b                     // src[i + 2]
        uxtl            v25.8h,  v25.8b                     // src[i + 3]
        mul             v20.8h,  v20.8h,  v0.h[2]
        mul             v22.8h,  v22.8h,  v0.h[3]
        mls             v20.8h,  v19.8h,  v0.h[1]
        mls             v22.8h,  v25.8h,  v0.h[4]
        sqadd           v22.8h,  v20.8h,  v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm
// 4-tap vertical filter producing two output rows of 8 pixels.
//   \d0:       destination; output row 0 in the low 8 bytes, row 1 in the
//              high 8 bytes
//   \s0..\s4:  five consecutive input rows (8 bytes each); all five are
//              widened in place and \s2 is additionally used as accumulator
//   v0.h[1-4]: filter taps; taps 1 and 4 are subtracted (mls), 2 and 3 added
// Row 0 is filtered from \s0..\s3, row 1 from \s1..\s4.
// Scratch: v21, v22, v23.
.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h,  \s0\().8b
        uxtl            \s1\().8h,  \s1\().8b
        uxtl            \s2\().8h,  \s2\().8b
        uxtl            \s3\().8h,  \s3\().8b
        uxtl            \s4\().8h,  \s4\().8b

        // v21 + v23 form row 0, \s2 + v22 form row 1.
        mul             v21.8h,     \s1\().8h, v0.h[2]
        mul             v23.8h,     \s2\().8h, v0.h[3]
        mul             \s2\().8h,  \s2\().8h, v0.h[2]
        mul             v22.8h,     \s3\().8h, v0.h[3]
        mls             v21.8h,     \s0\().8h, v0.h[1]
        mls             v23.8h,     \s3\().8h, v0.h[4]
        mls             \s2\().8h,  \s1\().8h, v0.h[1]
        mls             v22.8h,     \s4\().8h, v0.h[4]

        sqadd           v21.8h,     v21.8h,    v23.8h
        sqadd           \s2\().8h,  \s2\().8h, v22.8h
        sqrshrun        \d0\().8b,  v21.8h,    #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm
// Filter tap table: seven rows of eight 16-bit entries (16 bytes per row);
// only the first six entries of a row are read as taps.  The routines in this
// file address it as (subpel_filters - 16) + index * 16, i.e. with a 1-based
// index.  Entries hold magnitudes; the filter macros subtract taps 1 and 4.
//
// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
// arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
// 16-pixel-wide 6-tap vertical filter, two output rows per iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count (decremented by 2 until it reaches zero)
//   w6: vertical filter index (1-based row of subpel_filters)
// Scratch: x17, v0-v4, v16-v25, v31.
function ff_put_vp8_epel16_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // start two rows above

        sxtw            x4,  w4
        sxtw            x6,  w6
        movrel          x17, subpel_filters, -16
        add             x6,  x17, x6,  lsl #4   // y
        ld1             {v0.8h},  [x6]
1:
        // Seven source rows; each row is split into a left and a right
        // 8-byte half held in a pair of registers.
        ld1             {v1.1d  - v2.1d},  [x2], x3
        ld1             {v3.1d  - v4.1d},  [x2], x3
        ld1             {v16.1d - v17.1d}, [x2], x3
        ld1             {v18.1d - v19.1d}, [x2], x3
        ld1             {v20.1d - v21.1d}, [x2], x3
        ld1             {v22.1d - v23.1d}, [x2], x3
        ld1             {v24.1d - v25.1d}, [x2]
        sub             x2,  x2,  x3,  lsl #2   // net advance: two rows

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24     // left halves
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25     // right halves

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            1b

        ret
endfunc
// 16-pixel-wide 6-tap horizontal filter, one row per iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count (decremented by 1 until it reaches zero)
//   w5: horizontal filter index (1-based row of subpel_filters)
// Scratch: x17, v0-v3, v16-v23.
function ff_put_vp8_epel16_h6_neon, export=1
        sub             x2,  x2,  #2            // start two pixels to the left
        sxtw            x5,  w5                 // x

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5,  x17, x5,  lsl #4   // x
        ld1             {v0.8h},  [x5]
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x0], x1

        subs            w4,  w4,  #1
        b.ne            1b
        ret
endfunc
// 16-pixel-wide 6-tap horizontal + 6-tap vertical filter.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count h (second pass decrements by 2 until zero)
//   w5 / w6: horizontal / vertical filter index (1-based rows of
//            subpel_filters)
// The first pass writes h + 5 filtered rows of 16 bytes into a stack buffer;
// the second pass filters that buffer vertically.
// Scratch: x7, x16, x17, v0-v4, v16-v25, v31.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // two rows up
        sub             x2,  x2,  #2            // two pixels left

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5                 // x
        add             x16, x17, x5,  lsl #4   // x
        sub             sp,  sp,  #336+16       // 21 rows * 16 bytes + alignment slack
        ld1             {v0.8h},  [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4,  #5            // h
        bic             x7,  x7,  #15           // x7 = 16-byte aligned buffer start
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15           // back to the buffer start
2:
        // Seven buffered rows (112 bytes); even registers of each pair hold
        // the left half of a row, odd registers the right half.
        ld1             {v1.8b  - v4.8b},  [x7], #32
        ld1             {v16.8b - v19.8b}, [x7], #32
        ld1             {v20.8b - v23.8b}, [x7], #32
        ld1             {v24.8b - v25.8b}, [x7]
        sub             x7,  x7,  #64           // net advance: two rows (32 bytes)

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24     // left halves
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25     // right halves
        trn1            v1.2d,   v1.2d,   v2.2d     // join halves of output row 0
        trn1            v3.2d,   v3.2d,   v4.2d     // join halves of output row 1

        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc
// 8-pixel-wide 6-tap vertical filter, two output rows per iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count (decremented by 2 until it reaches zero)
//   w6: vertical filter index (1-based row of subpel_filters)
// Scratch: x7, v0, v2-v7, v28, v31.
function ff_put_vp8_epel8_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // start two rows above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2], x3
        ld1             {v7.8b},  [x2], x3
        ld1             {v28.8b}, [x2]

        sub             x2,  x2,  x3,  lsl #2   // net advance: two rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc
// 8-pixel-wide 6-tap horizontal filter, one row per iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count (decremented by 1 until it reaches zero)
//   w5: horizontal filter index (1-based row of subpel_filters)
// Scratch: x7, v0, v2, v3, v18, v19, v21-v26.
function ff_put_vp8_epel8_h6_neon, export=1
        sub             x2,  x2,  #2            // start two pixels to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h6    v2,  v2,  v3

        st1             {v2.8b},  [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
// 8-pixel-wide 6-tap horizontal + 6-tap vertical filter.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count h (second pass decrements by 2 until zero)
//   w5 / w6: horizontal / vertical filter index (1-based rows of
//            subpel_filters)
// The first pass writes h + 5 filtered rows of 8 bytes into a stack buffer;
// the second pass filters that buffer vertically.
// Scratch: x7, x16, x17, v0-v7, v18, v19, v21-v26, v31.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // two rows up
        sub             x2,  x2,  #2            // two pixels left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4   // x
        // The intermediate rows need 168 bytes (21 rows of 8, which assumes
        // h <= 16).  Reserve 176 + 16 rather than 168 + 16 so that sp itself
        // stays a multiple of 16 for the whole function; the extra 16 bytes
        // still cover the round-up of x7 below.
        sub             sp,  sp,  #176+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5            // h
        bic             x7,  x7,  #15           // x7 = 16-byte aligned buffer start
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b},  [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15           // back to the buffer start
2:
        // Seven buffered rows (56 bytes) in v1-v7.
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16           // net advance: two rows (16 bytes)

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
// 8-pixel-wide 4-tap vertical filter, two output rows per iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count (decremented by 2 until it reaches zero)
//   w6: vertical filter index (1-based row of subpel_filters)
// Scratch: x7, v0, v2-v6, v21-v23.
function ff_put_vp8_epel8_v4_neon, export=1
        sub             x2,  x2,  x3            // start one row above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2]
        sub             x2,  x2,  x3,  lsl #1   // net advance: two rows

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        st1             {v2.d}[0], [x0], x1     // output row 0
        st1             {v2.d}[1], [x0], x1     // output row 1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc
// 8-pixel-wide 4-tap horizontal filter, one row per iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count (decremented by 1 until it reaches zero)
//   w5: horizontal filter index (1-based row of subpel_filters)
// Scratch: x7, v0, v2, v3, v19, v20, v22, v23, v25.
function ff_put_vp8_epel8_h4_neon, export=1
        sub             x2,  x2,  #1            // start one pixel to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},  [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h4    v2,  v2,  v3

        st1             {v2.8b},  [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
// 8-pixel-wide 4-tap horizontal + 6-tap vertical filter.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count h (second pass decrements by 2 until zero)
//   w5 / w6: horizontal / vertical filter index (1-based rows of
//            subpel_filters)
// The first pass writes h + 5 filtered rows of 8 bytes into a stack buffer;
// the second pass filters that buffer vertically.
// Scratch: x7, x16, x17, v0-v7, v19, v20, v22, v23, v25, v31.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // two rows up
        sub             x2,  x2,  #1            // one pixel left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4   // x
        // The intermediate rows need 168 bytes (21 rows of 8, which assumes
        // h <= 16).  Reserve 176 + 16 rather than 168 + 16 so that sp itself
        // stays a multiple of 16 for the whole function; the extra 16 bytes
        // still cover the round-up of x7 below.
        sub             sp,  sp,  #176+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5            // h
        bic             x7,  x7,  #15           // x7 = 16-byte aligned buffer start
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b},  [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15           // back to the buffer start
2:
        // Seven buffered rows (56 bytes) in v1-v7.
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16           // net advance: two rows (16 bytes)

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b},  [x0], x1
        st1             {v2.8b},  [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
// 8-pixel-wide 4-tap horizontal + 4-tap vertical filter.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count h (second pass decrements by 2 until zero)
//   w5 / w6: horizontal / vertical filter index (1-based rows of
//            subpel_filters)
// The first pass writes h + 3 filtered rows of 8 bytes into a stack buffer;
// the second pass filters that buffer vertically.
// Scratch: x7, x16, x17, v0-v5, v19-v23, v25.
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3            // one row up
        sub             x2,  x2,  #1            // one pixel left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4   // x
        // Same reservation as the 6-tap vertical variants.  176 + 16 rather
        // than 168 + 16 keeps sp itself a multiple of 16 for the whole
        // function; the extra 16 bytes still cover the round-up of x7 below.
        sub             sp,  sp,  #176+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3            // h
        bic             x7,  x7,  #15           // x7 = 16-byte aligned buffer start
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b},  [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15           // back to the buffer start
2:
        // Five buffered rows (40 bytes) in v1-v5; x7 advances by two rows.
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1     // output row 0
        st1             {v1.d}[1], [x0], x1     // output row 1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
// 8-pixel-wide 6-tap horizontal + 4-tap vertical filter.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count h (second pass decrements by 2 until zero)
//   w5 / w6: horizontal / vertical filter index (1-based rows of
//            subpel_filters)
// The first pass writes h + 3 filtered rows of 8 bytes into a stack buffer;
// the second pass filters that buffer vertically.
// Scratch: x7, x16, x17, v0-v5, v18, v19, v21-v26.
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3            // one row up
        sub             x2,  x2,  #2            // two pixels left
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4   // x
        // Same reservation as the 6-tap vertical variants.  176 + 16 rather
        // than 168 + 16 keeps sp itself a multiple of 16 for the whole
        // function; the extra 16 bytes still cover the round-up of x7 below.
        sub             sp,  sp,  #176+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3            // h
        bic             x7,  x7,  #15           // x7 = 16-byte aligned buffer start
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b},  [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4   // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15           // back to the buffer start
2:
        // Five buffered rows (40 bytes) in v1-v5; x7 advances by two rows.
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1     // output row 0
        st1             {v1.d}[1], [x0], x1     // output row 1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #176+16
        ret
endfunc
// 4-pixel-wide 6-tap vertical filter, four output rows per iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count (decremented by 4 until it reaches zero)
//   w6: vertical filter index (1-based row of subpel_filters)
// Each vector carries source row r in its first 32-bit lane and row r + 2 in
// its second, so one pass of the 8-wide macro yields output rows 0/1 (lane 0)
// and 2/3 (lane 1).
// Scratch: x7, v0, v2-v7, v28, v31.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // start two rows above

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},    [x6]
1:
        // lane 0 (and, for now, lane 1): rows 0..6
        ld1r            {v2.2s},    [x2], x3
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
        ld1r            {v5.2s},    [x2], x3
        ld1r            {v6.2s},    [x2], x3
        ld1r            {v7.2s},    [x2], x3
        ld1r            {v28.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #2   // back to row 2
        // lane 1: rows 2..8
        ld1             {v2.s}[1],  [x2], x3
        ld1             {v3.s}[1],  [x2], x3
        ld1             {v4.s}[1],  [x2], x3
        ld1             {v5.s}[1],  [x2], x3
        ld1             {v6.s}[1],  [x2], x3
        ld1             {v7.s}[1],  [x2], x3
        ld1             {v28.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #2   // net advance: four rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.s}[0],  [x0], x1    // output row 0
        st1             {v3.s}[0],  [x0], x1    // output row 1
        st1             {v2.s}[1],  [x0], x1    // output row 2
        st1             {v3.s}[1],  [x0], x1    // output row 3
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
// 4-pixel-wide 6-tap horizontal filter, one row per iteration.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count (decremented by 1 until it reaches zero)
//   w5: horizontal filter index (1-based row of subpel_filters)
// The 8-wide macro is used and only the first four result bytes are stored.
// Scratch: x7, v0, v2, v3, v18, v19, v21-v26.
function ff_put_vp8_epel4_h6_neon, export=1
        sub             x2,  x2,  #2            // start two pixels to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},        [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],      [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
// 4-pixel-wide 6-tap horizontal + 6-tap vertical filter.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count h (second pass decrements by 4 until zero)
//   w5 / w6: horizontal / vertical filter index (1-based rows of
//            subpel_filters)
// The first pass writes h + 5 filtered rows of 4 bytes into a stack buffer;
// the second pass filters that buffer vertically, four rows per iteration.
// Scratch: x7, w8, x9, v0-v7, v18, v19, v21-v26, v28, v31.
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // two rows up
        sub             x2,  x2,  #2            // two pixels left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},        [x5]

        // The buffer needs 52 bytes (13 rows of 4, which assumes h <= 8);
        // 64 are reserved so that sp stays a multiple of 16.
        sub             sp,  sp,  #64
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],      [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},        [x6]
        mov             x9,  sp
2:
        // Buffered rows 0-5 in v2/v3/v6, row 6 broadcast into v28 ...
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1             {v6.8b},        [x9], #8
        ld1r            {v28.2s},       [x9]
        sub             x9,  x9,  #16           // back to row 2
        // ... rows 2-7 in v4/v5/v7, row 8 into the second lane of v28.
        ld1             {v4.8b, v5.8b}, [x9], #16
        ld1             {v7.8b},        [x9], #8
        ld1             {v28.s}[1],     [x9]
        sub             x9,  x9,  #16           // net advance: four rows (16 bytes)
        // Regroup so every vector holds row r (lane 0) and row r + 2 (lane 1).
        trn1            v1.2s,   v2.2s,   v4.2s     // rows 0, 2
        trn2            v4.2s,   v2.2s,   v4.2s     // rows 1, 3
        trn1            v2.2s,   v3.2s,   v5.2s     // rows 2, 4
        trn2            v5.2s,   v3.2s,   v5.2s     // rows 3, 5
        trn1            v3.2s,   v6.2s,   v7.2s     // rows 4, 6
        trn2            v7.2s,   v6.2s,   v7.2s     // rows 5, 7
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0],  [x0], x1    // output row 0
        st1             {v3.s}[0],  [x0], x1    // output row 1
        st1             {v2.s}[1],  [x0], x1    // output row 2
        st1             {v3.s}[1],  [x0], x1    // output row 3
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #64
        ret
endfunc
// 4-pixel-wide 4-tap horizontal + 6-tap vertical filter.
//   x0: dst, x1: dst stride, x2: src, x3: src stride
//   w4: row count h (second pass decrements by 4 until zero)
//   w5 / w6: horizontal / vertical filter index (1-based rows of
//            subpel_filters)
// The first pass writes h + 5 filtered rows of 4 bytes into a stack buffer;
// the second pass filters that buffer vertically, four rows per iteration.
// Scratch: x7, w8, x9, v0-v7, v19, v20, v22, v23, v25, v28, v31.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // two rows up
        sub             x2,  x2,  #1            // one pixel left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5,  uxtw #4
        ld1             {v0.8h},        [x5]

        // The buffer needs 52 bytes (13 rows of 4, which assumes h <= 8);
        // 64 are reserved so that sp stays a multiple of 16.
        sub             sp,  sp,  #64
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        // Only 8 source bytes are loaded; the four stored results depend on
        // source bytes 0..6 only, so passing v2 twice is sufficient.
        ld1             {v2.8b},        [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],      [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6,  uxtw #4
        ld1             {v0.8h},        [x6]
        mov             x9,  sp
2:
        // Buffered rows 0-5 in v2/v3/v6, row 6 broadcast into v28 ...
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1             {v6.8b},        [x9], #8
        ld1r            {v28.2s},       [x9]
        sub             x9,  x9,  #16           // back to row 2
        // ... rows 2-7 in v4/v5/v7, row 8 into the second lane of v28.
        ld1             {v4.8b, v5.8b}, [x9], #16
        ld1             {v7.8b},        [x9], #8
        ld1             {v28.s}[1],     [x9]
        sub             x9,  x9,  #16           // net advance: four rows (16 bytes)
        // Regroup so every vector holds row r (lane 0) and row r + 2 (lane 1).
        trn1            v1.2s,   v2.2s,   v4.2s     // rows 0, 2
        trn2            v4.2s,   v2.2s,   v4.2s     // rows 1, 3
        trn1            v2.2s,   v3.2s,   v5.2s     // rows 2, 4
        trn2            v5.2s,   v3.2s,   v5.2s     // rows 3, 5
        trn1            v3.2s,   v6.2s,   v7.2s     // rows 4, 6
        trn2            v7.2s,   v6.2s,   v7.2s     // rows 5, 7
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0],  [x0], x1    // output row 0
        st1             {v3.s}[0],  [x0], x1    // output row 1
        st1             {v2.s}[1],  [x0], x1    // output row 2
        st1             {v3.s}[1],  [x0], x1    // output row 3
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #64
        ret
endfunc
// 4xh subpel MC: 6-tap horizontal filter followed by a 4-tap vertical filter.
//
// Registers as used below:
//   x0 = dst (advanced by x1 after every stored row)   x1 = dst stride
//   x2 = src (advanced by x3 after every loaded row)   x3 = src stride
//   w4 = h, consumed four rows per vertical iteration
//   w5 = mx, w6 = my: select 16-byte entries of subpel_filters
// Scratch: x5-x9, v0-v6, plus whatever the vp8_epel8_* macros use.
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             x2,  x2,  x3            // one row of context above
        sub             x2,  x2,  #2            // two pixels of context to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // x5 = x7 + mx * 16
        ld1             {v0.8h},       [x5]     // horizontal taps

        // Intermediate buffer: 4 bytes per row. 44 bytes (11 rows, which
        // implies h <= 8) are needed; 48 are reserved so that sp keeps its
        // 16-byte alignment for the whole function.
        sub             sp,  sp,  #48
        add             w8,  w4,  #3            // pass 1 produces h + 3 rows
        mov             x9,  sp
1:
        // Pass 1: filter one source row horizontally, keep 4 result bytes.
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4   // x6 = x7 + my * 16
        ld1             {v0.8h},       [x6]     // vertical taps
        mov             x9,  sp
2:
        // Pass 2: read seven buffered rows r0..r6 and arrange them so that
        // each 64-bit register pairs row n (low half) with row n + 2.
        ld1             {v2.8b,v3.8b}, [x9], #16    // v2 = r0 r1, v3 = r2 r3
        ld1r            {v6.2s},       [x9]         // v6 = r4 r4
        sub             x9,  x9,  #8                // back to r2
        ld1             {v4.8b,v5.8b}, [x9], #16    // v4 = r2 r3, v5 = r4 r5
        ld1             {v6.s}[1],     [x9]         // v6 = r4 r6
        sub             x9,  x9,  #8                // x9 -> r4 for the next group
        trn1            v1.2s, v2.2s, v4.2s         // r0 r2
        trn2            v4.2s, v2.2s, v4.2s         // r1 r3
        trn1            v2.2s, v3.2s, v5.2s         // r2 r4
        trn2            v5.2s, v3.2s, v5.2s         // r3 r5
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        // Lanes 0, 2, 1, 3 of the 128-bit result go to consecutive rows.
        st1             {v1.s}[0],     [x0], x1
        st1             {v1.s}[2],     [x0], x1
        st1             {v1.s}[1],     [x0], x1
        st1             {v1.s}[3],     [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #48
        ret
endfunc
// 4xh subpel MC using only a 4-tap horizontal filter.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
// One output row of four bytes is produced per loop iteration.
function ff_put_vp8_epel4_h4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // x5 = x7 + mx * 16
        sub             x2,  x2,  #1            // one pixel of context to the left
        ld1             {v0.8h},       [x5]     // filter taps
1:
        ld1             {v2.8b},       [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x0], x1 // only the low four bytes are stored
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
// 4xh subpel MC using only a 4-tap vertical filter.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
// Four output rows are produced per loop iteration.
function ff_put_vp8_epel4_v4_neon, export=1
        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4   // x6 = x7 + my * 16
        ld1             {v0.8h},    [x6]        // filter taps
        sub             x2,  x2,  x3            // one row of context above
1:
        // Seven source rows p0..p6 are read; each register ends up pairing
        // row n (low 32 bits) with row n + 2 (high 32 bits).
        ld1r            {v2.2s},    [x2], x3    // p0
        ld1r            {v3.2s},    [x2], x3    // p1
        ld1r            {v4.2s},    [x2], x3    // p2
        ld1r            {v5.2s},    [x2], x3    // p3
        ld1r            {v6.2s},    [x2]        // p4
        sub             x2,  x2,  x3,  lsl #1   // back to p2
        ld1             {v2.s}[1],  [x2], x3    // v2 = p0 p2
        ld1             {v3.s}[1],  [x2], x3    // v3 = p1 p3
        ld1             {v4.s}[1],  [x2], x3    // v4 = p2 p4
        ld1             {v5.s}[1],  [x2], x3    // v5 = p3 p5
        ld1             {v6.s}[1],  [x2]        // v6 = p4 p6
        sub             x2,  x2,  x3,  lsl #1   // x2 -> p4 for the next group

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        // Lanes 0, 2, 1, 3 of the 128-bit result go to consecutive rows.
        st1             {v2.s}[0],  [x0], x1
        st1             {v2.s}[2],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v2.s}[3],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
// 4xh subpel MC: 4-tap horizontal filter followed by a 4-tap vertical filter.
//
// Registers as used below:
//   x0 = dst (advanced by x1 after every stored row)   x1 = dst stride
//   x2 = src (advanced by x3 after every loaded row)   x3 = src stride
//   w4 = h, consumed four rows per vertical iteration
//   w5 = mx, w6 = my: select 16-byte entries of subpel_filters
// Scratch: x5-x9, v0-v6, plus whatever the vp8_epel8_* macros use.
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             x2,  x2,  x3            // one row of context above
        sub             x2,  x2,  #1            // one pixel of context to the left

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4   // x5 = x7 + mx * 16
        ld1             {v0.8h},       [x5]     // horizontal taps

        // Intermediate buffer: 4 bytes per row. 44 bytes (11 rows, which
        // implies h <= 8) are needed; 48 are reserved so that sp keeps its
        // 16-byte alignment for the whole function.
        sub             sp,  sp,  #48
        add             w8,  w4,  #3            // pass 1 produces h + 3 rows
        mov             x9,  sp
1:
        // Pass 1: filter one source row horizontally, keep 4 result bytes.
        // Only v2 has been loaded, so it is also passed as the second source
        // operand (same sequence as the other 4-wide h4 first passes).
        ld1             {v2.8b},       [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4   // x6 = x7 + my * 16
        ld1             {v0.8h},       [x6]     // vertical taps
        mov             x9,  sp
2:
        // Pass 2: read seven buffered rows r0..r6 and arrange them so that
        // each 64-bit register pairs row n (low half) with row n + 2.
        ld1             {v2.8b,v3.8b}, [x9], #16    // v2 = r0 r1, v3 = r2 r3
        ld1r            {v6.2s},       [x9]         // v6 = r4 r4
        sub             x9,  x9,  #8                // back to r2
        ld1             {v4.8b,v5.8b}, [x9], #16    // v4 = r2 r3, v5 = r4 r5
        ld1             {v6.s}[1],     [x9]         // v6 = r4 r6
        sub             x9,  x9,  #8                // x9 -> r4 for the next group
        trn1            v1.2s, v2.2s, v4.2s         // r0 r2
        trn2            v4.2s, v2.2s, v4.2s         // r1 r3
        trn1            v2.2s, v3.2s, v5.2s         // r2 r4
        trn2            v5.2s, v3.2s, v5.2s         // r3 r5
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        // Lanes 0, 2, 1, 3 of the 128-bit result go to consecutive rows.
        st1             {v1.s}[0],     [x0], x1
        st1             {v1.s}[2],     [x0], x1
        st1             {v1.s}[1],     [x0], x1
        st1             {v1.s}[3],     [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #48
        ret
endfunc
/* Bilinear MC */
// 16-pixel-wide horizontal bilinear MC, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
// Every output byte is (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3.
function ff_put_vp8_bilin16_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5                      // weight of src[x + 1]: mx
        sub             w5,  w7,  w5
        dup             v1.8b,  w5                      // weight of src[x]: 8 - mx
1:
        // Fetch both rows up front (24 bytes each, 17 of them are used).
        ld1             {v2.8b,v3.8b,v4.8b},    [x2], x3
        ld1             {v18.8b,v19.8b,v20.8b}, [x2], x3

        // First row.
        ext             v5.8b,  v3.8b,  v4.8b,  #1      // src[9..16]
        ext             v4.8b,  v2.8b,  v3.8b,  #1      // src[1..8]
        umull           v16.8h, v2.8b,  v1.8b
        umlal           v16.8h, v4.8b,  v0.8b
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v5.8b,  v0.8b
        rshrn           v4.8b,  v16.8h, #3
        rshrn2          v4.16b, v6.8h,  #3

        // Second row.
        ext             v21.8b, v19.8b, v20.8b, #1      // src[9..16]
        ext             v20.8b, v18.8b, v19.8b, #1      // src[1..8]
        umull           v22.8h, v18.8b, v1.8b
        umlal           v22.8h, v20.8b, v0.8b
        umull           v24.8h, v19.8b, v1.8b
        umlal           v24.8h, v21.8b, v0.8b
        rshrn           v6.8b,  v22.8h, #3
        rshrn2          v6.16b, v24.8h, #3

        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
// 16-pixel-wide vertical bilinear MC, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
// Every output byte is (row[y] * (8 - my) + row[y + 1] * my + 4) >> 3.
function ff_put_vp8_bilin16_v_neon, export=1
        ld1             {v2.16b}, [x2], x3              // first source row
        mov             w7,  #8
        dup             v0.16b, w6                      // weight of the lower row: my
        sub             w6,  w7,  w6
        dup             v1.16b, w6                      // weight of the upper row: 8 - my
1:
        // v2 holds the row above, v4 receives the next row.
        ld1             {v4.16b}, [x2], x3
        umull           v6.8h,  v2.8b,  v1.8b
        umlal           v6.8h,  v4.8b,  v0.8b
        umull2          v16.8h, v2.16b, v1.16b
        umlal2          v16.8h, v4.16b, v0.16b
        // v2 is free again: reload it with the row below v4.
        ld1             {v2.16b}, [x2], x3
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v2.8b,  v0.8b
        umull2          v20.8h, v4.16b, v1.16b
        umlal2          v20.8h, v2.16b, v0.16b
        rshrn           v4.8b,  v6.8h,  #3
        rshrn2          v4.16b, v16.8h, #3
        rshrn           v6.8b,  v18.8h, #3
        rshrn2          v6.16b, v20.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
// 16-pixel-wide bilinear MC in both directions, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h,
//   w5 = mx, w6 = my
// Each source row is first blended horizontally with weights (8 - mx, mx)
// and rounded to bytes; consecutive blended rows are then blended vertically
// with weights (8 - my, my). Both steps use a rounding shift right by 3.
function ff_put_vp8_bilin16_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,   w5             // mx
        dup             v2.16b,  w6             // my
        sub             w5,  w7,  w5
        sub             w6,  w7,  w6
        dup             v1.8b,   w5             // 8 - mx
        dup             v3.16b,  w6             // 8 - my

        // Horizontal pass over the very first row; the result lives in v4.
        ld1             {v4.8b,v5.8b,v6.8b},     [x2], x3
        ext             v7.8b,   v5.8b,  v6.8b,  #1
        ext             v6.8b,   v4.8b,  v5.8b,  #1
        umull           v16.8h,  v4.8b,  v1.8b
        umlal           v16.8h,  v6.8b,  v0.8b
        umull           v18.8h,  v5.8b,  v1.8b
        umlal           v18.8h,  v7.8b,  v0.8b
        rshrn           v4.8b,   v16.8h, #3
        rshrn2          v4.16b,  v18.8h, #3
1:
        // Horizontal pass, next row (accumulators v22/v24).
        ld1             {v18.8b,v19.8b,v20.8b},  [x2], x3
        ext             v21.8b,  v19.8b, v20.8b, #1
        ext             v20.8b,  v18.8b, v19.8b, #1
        umull           v22.8h,  v18.8b, v1.8b
        umlal           v22.8h,  v20.8b, v0.8b
        // Horizontal pass, row after that (accumulators v16/v18).
        ld1             {v26.8b,v27.8b,v28.8b},  [x2], x3
        umull           v24.8h,  v19.8b, v1.8b
        umlal           v24.8h,  v21.8b, v0.8b
        ext             v29.8b,  v27.8b, v28.8b, #1
        ext             v28.8b,  v26.8b, v27.8b, #1
        umull           v16.8h,  v26.8b, v1.8b
        umlal           v16.8h,  v28.8b, v0.8b
        umull           v18.8h,  v27.8b, v1.8b
        umlal           v18.8h,  v29.8b, v0.8b
        rshrn           v6.8b,   v22.8h, #3
        rshrn2          v6.16b,  v24.8h, #3
        // Vertical blend of the previous row (v4) with the new one (v6).
        umull           v24.8h,  v4.8b,  v3.8b
        umlal           v24.8h,  v6.8b,  v2.8b
        umull2          v30.8h,  v4.16b, v3.16b
        umlal2          v30.8h,  v6.16b, v2.16b
        // v4 now becomes the second new row, carried into the next iteration.
        rshrn           v4.8b,   v16.8h, #3
        rshrn2          v4.16b,  v18.8h, #3
        // Vertical blend of v6 with the row just stored in v4.
        umull           v20.8h,  v6.8b,  v3.8b
        umlal           v20.8h,  v4.8b,  v2.8b
        umull2          v22.8h,  v6.16b, v3.16b
        umlal2          v22.8h,  v4.16b, v2.16b
        rshrn           v24.8b,  v24.8h, #3
        rshrn2          v24.16b, v30.8h, #3
        st1             {v24.16b}, [x0], x1
        rshrn           v20.8b,  v20.8h, #3
        rshrn2          v20.16b, v22.8h, #3
        st1             {v20.16b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
// 8-pixel-wide horizontal bilinear MC, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
// Every output byte is (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3.
function ff_put_vp8_bilin8_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5                      // weight of src[x + 1]: mx
        sub             w5,  w7,  w5
        dup             v1.8b,  w5                      // weight of src[x]: 8 - mx
1:
        // Fetch both rows up front (16 bytes each, 9 of them are used).
        ld1             {v2.8b,v3.8b},  [x2], x3
        ld1             {v6.8b,v7.8b},  [x2], x3

        // First row.
        ext             v3.8b,  v2.8b,  v3.8b,  #1      // src[1..8]
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b

        // Second row.
        ext             v7.8b,  v6.8b,  v7.8b,  #1      // src[1..8]
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b

        rshrn           v4.8b,  v4.8h,  #3
        rshrn           v16.8b, v16.8h, #3
        st1             {v4.8b},  [x0], x1
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
// 8-pixel-wide vertical bilinear MC, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
// Every output byte is (row[y] * (8 - my) + row[y + 1] * my + 4) >> 3.
function ff_put_vp8_bilin8_v_neon, export=1
        ld1             {v2.8b}, [x2], x3               // first source row
        mov             w7,  #8
        dup             v0.8b,  w6                      // weight of the lower row: my
        sub             w6,  w7,  w6
        dup             v1.8b,  w6                      // weight of the upper row: 8 - my
1:
        // v2 holds the row above, v3 receives the next row.
        ld1             {v3.8b}, [x2], x3
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        // v2 is free again: reload it with the row below v3.
        ld1             {v2.8b}, [x2], x3
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v2.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        rshrn           v6.8b,  v6.8h,  #3
        st1             {v4.8b}, [x0], x1
        st1             {v6.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
// 8-pixel-wide bilinear MC in both directions, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h,
//   w5 = mx, w6 = my
// Each source row is first blended horizontally with weights (8 - mx, mx)
// and rounded to bytes; consecutive blended rows are then blended vertically
// with weights (8 - my, my). Both steps use a rounding shift right by 3.
function ff_put_vp8_bilin8_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,   w5             // mx
        dup             v2.8b,   w6             // my
        sub             w5,  w7,  w5
        sub             w6,  w7,  w6
        dup             v1.8b,   w5             // 8 - mx
        dup             v3.8b,   w6             // 8 - my

        // Horizontal pass over the very first row; the result lives in v22.
        ld1             {v4.8b,v5.8b},   [x2], x3
        ext             v5.8b,   v4.8b,  v5.8b,  #1
        umull           v18.8h,  v4.8b,  v1.8b
        umlal           v18.8h,  v5.8b,  v0.8b
        rshrn           v22.8b,  v18.8h, #3
1:
        // Horizontal pass, next row (accumulator v16).
        ld1             {v6.8b,v7.8b},   [x2], x3
        ext             v7.8b,   v6.8b,  v7.8b,  #1
        umull           v16.8h,  v6.8b,  v1.8b
        umlal           v16.8h,  v7.8b,  v0.8b
        // Horizontal pass, row after that (accumulator v18).
        ld1             {v4.8b,v5.8b},   [x2], x3
        ext             v5.8b,   v4.8b,  v5.8b,  #1
        umull           v18.8h,  v4.8b,  v1.8b
        umlal           v18.8h,  v5.8b,  v0.8b
        rshrn           v16.8b,  v16.8h, #3
        // Vertical blend of the previous row (v22) with the new one (v16).
        umull           v20.8h,  v22.8b, v3.8b
        umlal           v20.8h,  v16.8b, v2.8b
        // v22 now becomes the second new row, carried into the next iteration.
        rshrn           v22.8b,  v18.8h, #3
        // Vertical blend of v16 with the row just stored in v22.
        umull           v24.8h,  v16.8b, v3.8b
        umlal           v24.8h,  v22.8b, v2.8b
        rshrn           v20.8b,  v20.8h, #3
        st1             {v20.8b}, [x0], x1
        rshrn           v23.8b,  v24.8h, #3
        st1             {v23.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
// 4-pixel-wide horizontal bilinear MC, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w5 = mx
// Every output byte is (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3.
function ff_put_vp8_bilin4_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,   w5                     // weight of src[x + 1]: mx
        sub             w5,  w7,  w5
        dup             v1.8b,   w5                     // weight of src[x]: 8 - mx
1:
        subs            w4,  w4,  #2
        // The shifted copy takes its last byte from the row itself instead
        // of from a register that has not been written yet. That byte never
        // reaches the output: trn1 below keeps only the low 32 bits of each
        // shifted row.
        ld1             {v2.8b}, [x2],   x3
        ext             v3.8b,   v2.8b,  v2.8b,  #1     // first row, src[1..]
        ld1             {v6.8b}, [x2],   x3
        ext             v7.8b,   v6.8b,  v6.8b,  #1     // second row, src[1..]
        trn1            v2.2s,   v2.2s,  v6.2s          // row0[0..3] row1[0..3]
        trn1            v3.2s,   v3.2s,  v7.2s          // row0[1..4] row1[1..4]
        umull           v4.8h,   v2.8b,  v1.8b
        umlal           v4.8h,   v3.8b,  v0.8b
        rshrn           v4.8b,   v4.8h,  #3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        b.gt            1b

        ret
endfunc
// 4-pixel-wide vertical bilinear MC, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h, w6 = my
// Every output byte is (row[y] * (8 - my) + row[y + 1] * my + 4) >> 3.
function ff_put_vp8_bilin4_v_neon, export=1
        ld1r            {v2.2s},   [x2], x3     // v2 = r0 r0
        mov             w7,  #8
        dup             v0.8b,   w6             // weight of the lower row: my
        sub             w6,  w7,  w6
        dup             v1.8b,   w6             // weight of the upper row: 8 - my
1:
        // On entry the low half of v2 is the row above (call it r0).
        ld1r            {v3.2s},   [x2]         // v3 = r1 r1 (no pointer update)
        ld1             {v2.s}[1], [x2], x3     // v2 = r0 r1
        ld1             {v3.s}[1], [x2], x3     // v3 = r1 r2
        subs            w4,  w4,  #2
        umull           v4.8h,   v2.8b,  v1.8b
        umlal           v4.8h,   v3.8b,  v0.8b
        trn2            v2.2s,   v3.2s,  v2.2s  // v2 = r2 r1: r2 leads the next pair
        rshrn           v4.8b,   v4.8h,  #3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        b.gt            1b

        ret
endfunc
// 4-pixel-wide bilinear MC in both directions, two rows per loop iteration.
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = h,
//   w5 = mx, w6 = my
// Each source row is first blended horizontally with weights (8 - mx, mx)
// and rounded to bytes; consecutive blended rows are then blended vertically
// with weights (8 - my, my). Both steps use a rounding shift right by 3.
function ff_put_vp8_bilin4_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,   w5             // mx
        dup             v2.8b,   w6             // my
        sub             w5,  w7,  w5
        sub             w6,  w7,  w6
        dup             v1.8b,   w5             // 8 - mx
        dup             v3.8b,   w6             // 8 - my

        // Horizontal pass over the very first row; low 32 bits of v22 = H(r0).
        ld1             {v4.8b},  [x2], x3
        ext             v5.8b,   v4.8b,  v4.8b,  #1
        umull           v18.8h,  v4.8b,  v1.8b
        umlal           v18.8h,  v5.8b,  v0.8b
        rshrn           v22.8b,  v18.8h, #3
1:
        // Load the next two rows (ra, rb) and pack them into one register.
        ld1             {v6.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ext             v7.8b,   v6.8b,  v6.8b,  #1
        ext             v5.8b,   v4.8b,  v4.8b,  #1
        trn1            v6.2s,   v6.2s,  v4.2s          // ra[0..3] rb[0..3]
        trn1            v7.2s,   v7.2s,  v5.2s          // ra[1..4] rb[1..4]
        umull           v16.8h,  v6.8b,  v1.8b
        umlal           v16.8h,  v7.8b,  v0.8b
        rshrn           v16.8b,  v16.8h, #3             // v16 = H(ra) H(rb)
        umull           v20.8h,  v16.8b, v2.8b          // lower rows * my
        trn1            v22.2s,  v22.2s, v16.2s         // v22 = H(prev) H(ra)
        umlal           v20.8h,  v22.8b, v3.8b          // upper rows * (8 - my)
        rev64           v22.2s,  v16.2s                 // low half = H(rb) for next pass
        rshrn           v20.8b,  v20.8h, #3
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc
Messung V0.5 in Prozent C=100 H=100 G=100