/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
// h264_loop_filter_start
//
// Common prologue of the non-intra (bS < 4) deblocking filters.
//   In:  w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes
//   Out: w6 and v24.s[0] = the four tc0 bytes as one 32-bit word
//   Clobbers: w8, condition flags
//
// The macro executes `ret` (leaving the enclosing function) when
//   - alpha != 0 and beta == 0, or
//   - all four tc0 bytes have their sign bit set.
// Otherwise execution continues after the macro at local label 2.
//
// NOTE(review): when alpha == 0 the ccmp condition fails and the flags are
// set to nzcv = 0, so Z is clear and the early return is NOT taken.
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]                       // four tc0 bytes
        ccmp            w3,  #0,  #0,  ne               // alpha != 0 ? cmp beta, 0 : nzcv = 0
        mov             v24.s[0], w6
        and             w8,  w6,  w6,  lsl #16          // fold sign bits of bytes 1/3 and 0/2
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8           // N = AND of all four sign bits
        b.ge            2f                              // at least one tc0 >= 0: go on
1:
        ret
2:
.endm
// h264_loop_filter_luma
//
// Normal (non-intra) luma edge filter on 16 pixels at once, 8 bits/sample.
//   In:  v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//        v24.s[0] = four tc0 bytes, w2 = alpha, w3 = beta
//   Out: v17 = p1', v16 = p0', v0 = q0', v19 = q1'
//   Clobbers: x7, v4, v20-v24, v28, v30
//
// Branches to local label `9` (which the caller must define after the
// macro) when no pixel passes the alpha/beta/tc0 tests.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        uxtl            v24.8h,  v24.8b
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s,  v24.4h
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h,  v24.8h,  #8
        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
        sli             v24.4s,  v24.4s,  #16           // each tc0 byte now fills one 32-bit lane
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // tc0 < 0
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b       // drop pixels with tc0 < 0
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
        and             v21.16b, v21.16b, v30.16b       // v21 = per-pixel filter mask
        shrn            v30.8b,  v21.8h,  #4            // squeeze the 128-bit mask into 64 bits
        mov             x7,  v30.d[0]
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        cbz             x7,  9f                         // nothing to filter
        and             v17.16b, v17.16b, v21.16b
        and             v19.16b, v19.16b, v21.16b
        and             v24.16b, v24.16b, v21.16b
        urhadd          v28.16b, v16.16b, v0.16b
        sub             v21.16b, v24.16b, v17.16b       // masks are 0 / -1, so this adds 1 per set mask
        uqadd           v23.16b, v18.16b, v24.16b
        uhadd           v20.16b, v20.16b, v28.16b
        sub             v21.16b, v21.16b, v19.16b       // v21 = clamp bound for the p0/q0 delta
        uhadd           v28.16b, v4.16b,  v28.16b
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b
        uqadd           v4.16b,  v2.16b,  v24.16b
        umax            v23.16b, v23.16b, v22.16b       // p1 candidate, clamped to p1 +/- tc0
        uqsub           v22.16b, v2.16b,  v24.16b
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b
        umax            v28.16b, v28.16b, v22.16b       // q1 candidate, clamped to q1 +/- tc0
        uxtl2           v20.8h,  v0.16b
        usubw           v4.8h,   v4.8h,   v16.8b
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3            // (((q0 - p0) << 2) + p1 - q1 + 4) >> 3
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b       // p1' where mask set, else p1
        bsl             v19.16b, v28.16b, v2.16b        // q1' where mask set, else q1
        neg             v23.16b, v21.16b
        uxtl            v28.8h,  v16.8b
        smin            v4.16b,  v4.16b,  v21.16b
        uxtl2           v21.8h,  v16.16b
        smax            v4.16b,  v4.16b,  v23.16b       // delta clamped to [-bound, bound]
        uxtl            v22.8h,  v0.8b
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h                 // saturate back to unsigned 8 bit
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm
// ff_h264_v_loop_filter_luma_neon
//
// Normal luma filter across a horizontal edge (filters in the vertical
// direction), 16 pixels wide.
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes
// Reads rows p2..q2 and rewrites rows p1, p0, q0 and q1.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v2.16b},  [x0], x1             // q1
        ld1             {v4.16b},  [x0], x1             // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1            // x0 = three rows above the edge
        ld1             {v20.16b}, [x0], x1             // p2
        ld1             {v18.16b}, [x0], x1             // p1
        ld1             {v16.16b}, [x0], x1             // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1            // back to the p1 row
        st1             {v17.16b}, [x0], x1             // p1'
        st1             {v16.16b}, [x0], x1             // p0'
        st1             {v0.16b},  [x0], x1             // q0'
        st1             {v19.16b}, [x0]                 // q1'
9:
        ret
endfunc
// ff_h264_h_loop_filter_luma_neon
//
// Normal luma filter across a vertical edge (filters in the horizontal
// direction), 16 rows tall.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes
// Loads 8 bytes starting 4 bytes left of the edge from each of 16 rows,
// transposes them so that the luma macro sees one pixel position per
// register, filters, transposes the four modified vectors back and stores
// 4 bytes per row starting 2 bytes left of the edge.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #4
        ld1             {v6.8b},    [x0], x1
        ld1             {v20.8b},   [x0], x1
        ld1             {v18.8b},   [x0], x1
        ld1             {v16.8b},   [x0], x1
        ld1             {v0.8b},    [x0], x1
        ld1             {v2.8b},    [x0], x1
        ld1             {v4.8b},    [x0], x1
        ld1             {v26.8b},   [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4            // rewind 16 rows
        add             x0,  x0,  #2                    // 2 bytes left of the edge
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1
9:
        ret
endfunc
// h264_loop_filter_start_intra
//
// Common prologue of the 8-bit intra filters.
//   In:  w2 = alpha, w3 = beta
//   Out: v30 = alpha and v31 = beta, each replicated to 16 bytes
//   Clobbers: w4
// Executes `ret` (leaving the enclosing function) when alpha and beta are
// both zero.
.macro  h264_loop_filter_start_intra
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        dup             v30.16b, w2                     // alpha
        dup             v31.16b, w3                     // beta
.endm
// h264_loop_filter_luma_intra
//
// Intra luma edge filter on 16 pixels at once, 8 bits/sample.
//   In:  v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2,
//        v3 = q3, v30 = alpha (x16), v31 = beta (x16)
//   Out: v5, v6, v7 (p2', p1', p0') and v0, v1, v2 (q0', q1', q2') updated
//        in place where the conditions hold
//   Clobbers: x4, v16-v31
//
// Conditions, as named in the comments below:
//   if_1 = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
//   if_2 = |p0-q0| < (alpha >> 2) + 2
//   if_3 = |p2-p0| < beta,  if_4 = |q2-q0| < beta
// `_1` results are the weak (p0/q0 only) filter, `_2` the strong filter.
//
// Branches to local label `9` (which the caller must define after the
// macro) when if_1 is false for every pixel.
.macro  h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b       // v19 = if_1
        shrn            v20.8b,  v19.8h,  #4            // squeeze the 128-bit mask into 64 bits
        mov             x4,  v20.d[0]
        cbz             x4,  9f                         // nothing to filter

        ushll           v20.8h,  v6.8b,   #1
        ushll           v22.8h,  v1.8b,   #1
        ushll2          v21.8h,  v6.16b,  #1
        ushll2          v23.8h,  v1.16b,  #1
        uaddw           v20.8h,  v20.8h,  v7.8b
        uaddw           v22.8h,  v22.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v7.16b
        uaddw2          v23.8h,  v23.8h,  v0.16b
        uaddw           v20.8h,  v20.8h,  v1.8b         // 2*p1 + p0 + q1
        uaddw           v22.8h,  v22.8h,  v6.8b         // 2*q1 + q0 + p1
        uaddw2          v21.8h,  v21.8h,  v1.16b
        uaddw2          v23.8h,  v23.8h,  v6.16b

        rshrn           v24.8b,  v20.8h,  #2            // p0'_1
        rshrn           v25.8b,  v22.8h,  #2            // q0'_1
        rshrn2          v24.16b, v21.8h,  #2            // p0'_1
        rshrn2          v25.16b, v23.8h,  #2            // q0'_1

        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v31.16b, v17.16b       // < beta
        cmhi            v18.16b, v31.16b, v18.16b       // < beta

        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4

        // strong filter, p side (inputs p3..p0 = v4..v7, mask v17)
        uaddl           v26.8h,  v5.8b,   v7.8b
        uaddl2          v27.8h,  v5.16b,  v7.16b
        uaddw           v26.8h,  v26.8h,  v0.8b         // p2 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v0.16b
        add             v20.8h,  v20.8h,  v26.8h
        add             v21.8h,  v21.8h,  v27.8h
        uaddw           v20.8h,  v20.8h,  v0.8b
        uaddw2          v21.8h,  v21.8h,  v0.16b
        rshrn           v20.8b,  v20.8h,  #3            // p0'_2
        rshrn2          v20.16b, v21.8h,  #3            // p0'_2
        uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
        uaddw2          v27.8h,  v27.8h,  v6.16b
        rshrn           v21.8b,  v26.8h,  #2            // p1'_2
        rshrn2          v21.16b, v27.8h,  #2            // p1'_2
        uaddl           v28.8h,  v4.8b,   v5.8b
        uaddl2          v29.8h,  v4.16b,  v5.16b
        shl             v28.8h,  v28.8h,  #1            // 2 * (p3 + p2)
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v19.8b,  v28.8h,  #3            // p2'_2
        rshrn2          v19.16b, v29.8h,  #3            // p2'_2

        // strong filter, q side (inputs q0..q3 = v0..v3, mask v18)
        uaddl           v26.8h,  v2.8b,   v0.8b
        uaddl2          v27.8h,  v2.16b,  v0.16b
        uaddw           v26.8h,  v26.8h,  v7.8b         // q2 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v7.16b
        add             v22.8h,  v22.8h,  v26.8h
        add             v23.8h,  v23.8h,  v27.8h
        uaddw           v22.8h,  v22.8h,  v7.8b
        uaddw2          v23.8h,  v23.8h,  v7.16b
        rshrn           v22.8b,  v22.8h,  #3            // q0'_2
        rshrn2          v22.16b, v23.8h,  #3            // q0'_2
        uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
        uaddw2          v27.8h,  v27.8h,  v1.16b
        rshrn           v23.8b,  v26.8h,  #2            // q1'_2
        rshrn2          v23.16b, v27.8h,  #2            // q1'_2
        uaddl           v28.8h,  v2.8b,   v3.8b
        uaddl2          v29.8h,  v2.16b,  v3.16b
        shl             v28.8h,  v28.8h,  #1            // 2 * (q3 + q2)
        shl             v29.8h,  v29.8h,  #1
        add             v28.8h,  v28.8h,  v26.8h
        add             v29.8h,  v29.8h,  v27.8h
        rshrn           v26.8b,  v28.8h,  #3            // q2'_2
        rshrn2          v26.16b, v29.8h,  #3            // q2'_2

        // insert the results only where the matching mask is set
        bit             v7.16b,  v24.16b, v30.16b       // p0'_1
        bit             v0.16b,  v25.16b, v31.16b       // q0'_1
        bit             v7.16b,  v20.16b, v17.16b       // p0'_2
        bit             v6.16b,  v21.16b, v17.16b       // p1'_2
        bit             v5.16b,  v19.16b, v17.16b       // p2'_2
        bit             v0.16b,  v22.16b, v18.16b       // q0'_2
        bit             v1.16b,  v23.16b, v18.16b       // q1'_2
        bit             v2.16b,  v26.16b, v18.16b       // q2'_2
.endm
// ff_h264_v_loop_filter_luma_intra_neon
//
// Intra luma filter across a horizontal edge, 16 pixels wide.
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
// Reads rows p3..q3 and rewrites rows p2..q2.
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v1.16b},  [x0], x1             // q1
        ld1             {v2.16b},  [x0], x1             // q2
        ld1             {v3.16b},  [x0], x1             // q3
        sub             x0,  x0,  x1, lsl #3            // four rows above the edge
        ld1             {v4.16b},  [x0], x1             // p3
        ld1             {v5.16b},  [x0], x1             // p2
        ld1             {v6.16b},  [x0], x1             // p1
        ld1             {v7.16b},  [x0]                 // p0

        h264_loop_filter_luma_intra

        sub             x0,  x0,  x1, lsl #1            // back to the p2 row
        st1             {v5.16b},  [x0], x1             // p2
        st1             {v6.16b},  [x0], x1             // p1
        st1             {v7.16b},  [x0], x1             // p0
        st1             {v0.16b},  [x0], x1             // q0
        st1             {v1.16b},  [x0], x1             // q1
        st1             {v2.16b},  [x0]                 // q2
9:
        ret
endfunc
// ff_h264_h_loop_filter_luma_intra_neon
//
// Intra luma filter across a vertical edge, 16 rows tall.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
// Loads 8 bytes starting 4 bytes left of the edge from each of 16 rows,
// transposes, filters, transposes back and rewrites the same 8 bytes of
// every row.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  #4
        ld1             {v4.8b},   [x0], x1
        ld1             {v5.8b},   [x0], x1
        ld1             {v6.8b},   [x0], x1
        ld1             {v7.8b},   [x0], x1
        ld1             {v0.8b},   [x0], x1
        ld1             {v1.8b},   [x0], x1
        ld1             {v2.8b},   [x0], x1
        ld1             {v3.8b},   [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0,  x0,  x1, lsl #4            // rewind 16 rows
        st1             {v4.8b},   [x0], x1
        st1             {v5.8b},   [x0], x1
        st1             {v6.8b},   [x0], x1
        st1             {v7.8b},   [x0], x1
        st1             {v0.8b},   [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v2.8b},   [x0], x1
        st1             {v3.8b},   [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
9:
        ret
endfunc
// h264_loop_filter_chroma
//
// Normal (non-intra) chroma edge filter on 8 pixels at once, 8 bits/sample.
//   In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1 (low 8 bytes each)
//        v24.s[0] = four tc bytes, w2 = alpha, w3 = beta
//   Out: v16 = p0', v0 = q0'
//   Clobbers: x8, v4, v22-v26, v28, v30
//
// Each tc byte is duplicated so that it applies to two adjacent pixels.
// Branches to local label `9` (which the caller must define after the
// macro) when no pixel passes the alpha/beta tests.
.macro  h264_loop_filter_chroma
        dup             v22.8b,  w2                     // alpha
        dup             v23.8b,  w3                     // beta
        uxtl            v24.8h,  v24.8b
        uabd            v26.8b,  v16.8b,  v0.8b         // abs(p0 - q0)
        uabd            v28.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v30.8b,  v2.8b,   v0.8b         // abs(q1 - q0)
        cmhi            v26.8b,  v22.8b,  v26.8b        // < alpha
        cmhi            v28.8b,  v23.8b,  v28.8b        // < beta
        cmhi            v30.8b,  v23.8b,  v30.8b        // < beta
        uxtl            v4.8h,   v0.8b
        and             v26.8b,  v26.8b,  v28.8b
        usubw           v4.8h,   v4.8h,   v16.8b
        and             v26.8b,  v26.8b,  v30.8b        // v26 = per-pixel filter mask
        shl             v4.8h,   v4.8h,   #2
        mov             x8,  v26.d[0]
        sli             v24.8h,  v24.8h,  #8            // each tc byte now covers two pixels
        uaddw           v4.8h,   v4.8h,   v18.8b
        cbz             x8,  9f                         // nothing to filter
        usubw           v4.8h,   v4.8h,   v2.8b
        rshrn           v4.8b,   v4.8h,   #3            // (((q0 - p0) << 2) + p1 - q1 + 4) >> 3
        smin            v4.8b,   v4.8b,   v24.8b
        neg             v25.8b,  v24.8b
        smax            v4.8b,   v4.8b,   v25.8b        // delta clamped to [-tc, tc]
        uxtl            v22.8h,  v0.8b
        and             v4.8b,   v4.8b,   v26.8b        // zero delta where the mask is clear
        uxtl            v28.8h,  v16.8b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        sqxtun          v16.8b,  v28.8h                 // saturate back to unsigned 8 bit
        sqxtun          v0.8b,   v22.8h
.endm
// ff_h264_v_loop_filter_chroma_neon
//
// Normal chroma filter across a horizontal edge, 8 pixels wide.
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer to four tc bytes
// Reads rows p1..q1 and rewrites rows p0 and q0.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  x1, lsl #1            // two rows above the edge
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v0.8b},  [x0], x1              // q0
        ld1             {v2.8b},  [x0]                  // q1

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1            // back to the p0 row
        st1             {v16.8b}, [x0], x1              // p0'
        st1             {v0.8b},  [x0], x1              // q0'
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma_neon
//
// Normal chroma filter across a vertical edge, 8 rows tall.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer to four tc bytes
//
// The inner label h_loop_filter_chroma420 is also entered from
// ff_h264_h_loop_filter_chroma422_neon. At that label x0 must already point
// 2 bytes left of the edge and v24.s[0] must hold the tc bytes.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #2
h_loop_filter_chroma420:
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3            // rewind 8 rows
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma422_neon
//
// Normal chroma filter across a vertical edge for 4:2:2 content.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer to four tc bytes
//
// Runs the 8-row 4:2:0 body twice with a doubled stride: first on the rows
// starting at x0, then on the rows starting one original stride further
// down. The link register is parked in x7 across the `bl`; the second pass
// is a tail call, so its `ret` returns to the original caller. v24.s[0] is
// reloaded from w6 because the filter macro overwrites v24.
function ff_h264_h_loop_filter_chroma422_neon, export=1
        h264_loop_filter_start
        add             x5,  x0,  x1                    // start of the second pass
        sub             x0,  x0,  #2
        add             x1,  x1,  x1                    // stride *= 2
        mov             x7,  x30                        // save the return address
        bl              h_loop_filter_chroma420

        mov             x30, x7
        sub             x0,  x5,  #2
        mov             v24.s[0], w6                    // restore the tc bytes
        b               h_loop_filter_chroma420
endfunc
// h264_loop_filter_chroma_intra
//
// Intra chroma edge filter on 8 pixels at once, 8 bits/sample.
//   In:  v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes each)
//        v30 = alpha, v31 = beta (replicated per byte)
//   Out: v16 = p0', v17 = q0' where the conditions hold
//   Clobbers: x2, v4, v6, v20, v22, v24-v28
//
// Branches to local label `9` (which the caller must define after the
// macro) when no pixel passes the alpha/beta tests.
.macro  h264_loop_filter_chroma_intra
        uabd            v26.8b,  v16.8b,  v17.8b        // abs(p0 - q0)
        uabd            v27.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        uabd            v28.8b,  v19.8b,  v17.8b        // abs(q1 - q0)
        cmhi            v26.8b,  v30.8b,  v26.8b        // < alpha
        cmhi            v27.8b,  v31.8b,  v27.8b        // < beta
        cmhi            v28.8b,  v31.8b,  v28.8b        // < beta
        and             v26.8b,  v26.8b,  v27.8b
        and             v26.8b,  v26.8b,  v28.8b        // v26 = per-pixel filter mask
        mov             x2,  v26.d[0]

        ushll           v4.8h,   v18.8b,  #1            // 2 * p1
        ushll           v6.8h,   v19.8b,  #1            // 2 * q1
        cbz             x2,  9f                         // nothing to filter
        uaddl           v20.8h,  v16.8b,  v19.8b        // p0 + q1
        uaddl           v22.8h,  v17.8b,  v18.8b        // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h
        add             v22.8h,  v22.8h,  v6.8h
        uqrshrn         v24.8b,  v20.8h,  #2            // (2*p1 + p0 + q1 + 2) >> 2
        uqrshrn         v25.8b,  v22.8h,  #2            // (2*q1 + q0 + p1 + 2) >> 2
        bit             v16.8b,  v24.8b,  v26.8b
        bit             v17.8b,  v25.8b,  v26.8b
.endm
// ff_h264_v_loop_filter_chroma_intra_neon
//
// Intra chroma filter across a horizontal edge, 8 pixels wide.
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
// Reads rows p1..q1 and rewrites rows p0 and q0.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0,  x0,  x1, lsl #1            // two rows above the edge
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v17.8b}, [x0], x1              // q0
        ld1             {v19.8b}, [x0]                  // q1

        h264_loop_filter_chroma_intra

        sub             x0,  x0,  x1, lsl #1            // back to the p0 row
        st1             {v16.8b}, [x0], x1              // p0'
        st1             {v17.8b}, [x0], x1              // q0'
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon
//
// Intra chroma filter across a vertical edge, 4 rows tall (MBAFF variant).
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
// x4 walks the input rows starting 2 bytes left of the edge; x0 walks the
// output rows starting 1 byte left of the edge, where the (p0', q0') pair
// of each row is stored.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2                    // load pointer
        sub             x0,  x0,  #1                    // store pointer
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma_intra_neon
//
// Intra chroma filter across a vertical edge, 8 rows tall.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
//
// The inner label h_loop_filter_chroma420_intra is also entered from
// ff_h264_h_loop_filter_chroma422_intra_neon. At that label x4 must be the
// load pointer (2 bytes left of the edge), x0 the store pointer (1 byte
// left of the edge) and v30/v31 must hold alpha/beta.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4,  x0,  #2                    // load pointer
        sub             x0,  x0,  #1                    // store pointer
h_loop_filter_chroma420_intra:
        ld1             {v18.8b},   [x4], x1
        ld1             {v16.8b},   [x4], x1
        ld1             {v17.8b},   [x4], x1
        ld1             {v19.8b},   [x4], x1
        ld1             {v18.s}[1], [x4], x1
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma422_intra_neon
//
// Intra chroma filter across a vertical edge for 4:2:2 content (16 rows).
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
//
// Runs the 8-row 4:2:0 intra body twice: on rows 0-7, then on rows 8-15.
// The load pointer x4 is left at row 8 by the first pass, so only the store
// pointer x0 has to be recomputed. The link register is parked in x7 across
// the `bl`; the second pass is a tail call.
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4,  x0,  #2                    // load pointer
        add             x5,  x0,  x1, lsl #3            // q0 of row 8
        sub             x0,  x0,  #1                    // store pointer
        mov             x7,  x30                        // save the return address
        bl              h_loop_filter_chroma420_intra
        sub             x0,  x5,  #1                    // store pointer for rows 8-15
        mov             x30, x7
        b               h_loop_filter_chroma420_intra
endfunc
// biweight_16 macs, macd
//
// Bi-directional weighted prediction loop for 16-pixel-wide rows, two rows
// per iteration. Ends with `ret`.
//   macd: widening multiply-accumulate (umlal/umlsl) applied to the rows
//         read from x0 with weight w5
//   macs: the same for the rows read from x1 with weight w6
//   In:  x0, x1 = the two source row pointers, x2 = row stride
//        w3 = row count (the loop ends when it reaches exactly zero)
//        w5, w6 = weights (magnitudes; the sign is expressed by macd/macs)
//        x7 = destination pointer
//        v16 = initial accumulator value for every 16-bit lane
//        v18 = per-lane shift count for sshl
//   Clobbers: x0, x1, w3, x7, flags, v0, v1, v4, v6, v20, v22, v24, v26,
//             v28, v30
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,  v20.8b
        \macd\()2       v6.8h,   v0.16b, v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,  v22.8b
        \macs\()2       v6.8h,   v1.16b, v22.16b
        mov             v24.16b, v16.16b
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,  v28.8b
        \macd\()2       v26.8h,  v0.16b, v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,  v30.8b
        \macs\()2       v26.8h,  v1.16b, v30.16b
        sshl            v4.8h,   v4.8h,  v18.8h
        sshl            v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h, v18.8h
        sshl            v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b                // re-arm accumulators for the next pass
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
.endm
// biweight_8 macs, macd
//
// Bi-directional weighted prediction loop for 8-pixel-wide rows, two rows
// per iteration. Ends with `ret`.
//   macd: widening multiply-accumulate (umlal/umlsl) applied to the rows
//         read from x0 with weight w5
//   macs: the same for the rows read from x1 with weight w6
//   In:  x0, x1 = the two source row pointers, x2 = row stride
//        w3 = row count (the loop ends when it reaches exactly zero)
//        w5, w6 = weights, x7 = destination pointer
//        v16 = initial accumulator value, v18 = per-lane shift for sshl
//   Clobbers: x0, x1, w3, x7, flags, v0-v2, v4-v7, v20
.macro  biweight_8      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #2
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-arm accumulators for the next pass
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
.endm
// biweight_4 macs, macd
//
// Bi-directional weighted prediction loop for 4-pixel-wide rows. Two rows
// are packed into one 8-byte vector, so one iteration handles four rows;
// when fewer than four rows remain (the counter goes negative) the tail at
// label 2 finishes the two rows already loaded. Ends with `ret`.
//   macd: widening multiply-accumulate (umlal/umlsl) applied to the rows
//         read from x0 with weight w5
//   macs: the same for the rows read from x1 with weight w6
//   In:  x0, x1 = the two source row pointers, x2 = row stride
//        w3 = row count, w5, w6 = weights, x7 = destination pointer
//        v16 = initial accumulator value, v18 = per-lane shift for sshl
//   Clobbers: x0, x1, w3, x7, flags, v0-v2, v4-v7, v20
.macro  biweight_4      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #4
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        b.lt            2f                              // only two rows were left
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-arm accumulators for the next pass
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
2:      sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm
// biweight_func w
//
// Emits ff_biweight_h264_pixels_<w>_neon, built around biweight_<w>.
//   x0 = first source and destination, x1 = second source, x2 = row stride
//   w3 = row count, w4 = shift amount (the total right shift is w4 + 1)
//   w5 = weight for the x0 rows, w6 = weight for the x1 rows, w7 = offset
//
// Setup:
//   v16 = ((w7 + 1) | 1) << w4   initial accumulator (offset and rounding)
//   v18 = ~w4 = -(w4 + 1)        negative sshl count, i.e. a right shift
//   x7  = x0                     destination pointer
//   w8  = (w5 >> 31) ^ (w6 >> 30), logical shifts; used as a selector:
//         0 -> label 10, 1 -> label 20, 2 -> label 30, else label 40.
// Labels 20/30/40 negate w5 and/or w6 and swap the matching umlal for
// umlsl, so the loop macros always multiply by the negated value.
// NOTE(review): the selector only maps cleanly onto the four sign
// combinations if bits 31 and 30 of w6 are equal (small-magnitude weight)
// - confirm against the callers.
//
// Each biweight_<w> expansion ends in `ret`, so the four variants never
// fall through into one another.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8h,  w4
        lsl             w7,  w7,  w4
        not             v18.16b, v18.16b
        dup             v16.8h,  w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4
// weight_16 add
//
// Single-source weighted prediction loop for 16-pixel-wide rows, two rows
// per iteration. Ends with `ret`.
//   add: instruction combining the offset vector v16 (first operand) with
//        the product weight * pixel (add, sub, shadd or shsub)
//   In:  x0 = source pointer, x1 = row stride
//        w2 = row count (the loop ends when it reaches exactly zero)
//        w4 = weight, x5 = destination pointer
//        v16 = offset term per 16-bit lane
//        v18 = per-lane shift count for srshl
//   Clobbers: x0, w2, x5, flags, v0, v4, v6, v20, v24, v26, v28
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,  v20.8b
        umull2          v6.8h,   v0.16b, v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,  v28.8b
        umull2          v26.8h,  v0.16b, v28.16b
        \add            v4.8h,   v16.8h, v4.8h
        srshl           v4.8h,   v4.8h,  v18.8h
        \add            v6.8h,   v16.8h, v6.8h
        srshl           v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h, v24.8h
        srshl           v24.8h,  v24.8h, v18.8h
        \add            v26.8h,  v16.8h, v26.8h
        srshl           v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
.endm
// weight_8 add
//
// Single-source weighted prediction loop for 8-pixel-wide rows, two rows
// per iteration. Ends with `ret`.
//   add: instruction combining the offset vector v16 (first operand) with
//        the product weight * pixel (add, sub, shadd or shsub)
//   In:  x0 = source pointer, x1 = row stride
//        w2 = row count (the loop ends when it reaches exactly zero)
//        w4 = weight, x5 = destination pointer
//        v16 = offset term, v18 = per-lane shift count for srshl
//   Clobbers: x0, w2, x5, flags, v0, v2, v4, v6, v20
.macro  weight_8        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #2
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
.endm
// weight_4 add
//
// Single-source weighted prediction loop for 4-pixel-wide rows. Two rows
// are packed into one 8-byte vector, so one iteration handles four rows;
// when fewer than four rows remain (the counter goes negative) the tail at
// label 2 finishes the two rows already loaded. Ends with `ret`.
//   add: instruction combining the offset vector v16 (first operand) with
//        the product weight * pixel (add, sub, shadd or shsub)
//   In:  x0 = source pointer, x1 = row stride, w2 = row count
//        w4 = weight, x5 = destination pointer
//        v16 = offset term, v18 = per-lane shift count for srshl
//   Clobbers: x0, w2, x5, flags, v0, v2, v4, v6, v20
.macro  weight_4        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #4
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        b.lt            2f                              // only two rows were left
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
2:      \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm
// weight_func w
//
// Emits ff_weight_h264_pixels_<w>_neon, built around weight_<w>.
//   x0 = source and destination, x1 = row stride, w2 = row count
//   w3 = shift amount, w4 = weight, w5 = offset
//
// Setup:
//   v16 = w5 << w3   offset term
//   x5  = x0         destination pointer
// w3 > 1:  v18 = 1 - w3 and the halving shadd/shsub is used; the halving
//          supplies one bit of the shift and srshl the remaining w3 - 1.
// w3 <= 1: v18 = -w3 and plain add/sub is used (label 20).
// In both cases a negative weight is negated and the subtracting variant
// is selected, so the loop macros always multiply by the negated value.
//
// Each weight_<w> expansion ends in `ret`, so the variants never fall
// through into one another. Label 10 is defined twice; every `10f` refers
// to the nearest following definition.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5
        mov             x5,  x0
        b.le            20f                             // flags from the cmp above
        sub             w6,  w6,  w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4
// h264_loop_filter_start_10
//
// Common prologue of the 10-bit non-intra filters.
//   In:  w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes
//   Out: w2 = alpha << 2, w3 = beta << 2
//        w6 and v24.s[0] = the four tc0 bytes as one 32-bit word
//   Clobbers: w8, condition flags
//
// The macro executes `ret` (leaving the enclosing function) when
//   - alpha != 0 and beta == 0, or
//   - all four tc0 bytes have their sign bit set.
// Otherwise execution continues after the macro at local label 2.
// The lsl instructions do not touch the flags, so b.eq still sees the
// result of the cmp/ccmp pair.
//
// NOTE(review): when alpha == 0 the ccmp condition fails and the flags are
// set to nzcv = 0, so Z is clear and the early return is NOT taken.
.macro  h264_loop_filter_start_10
        cmp             w2,  #0
        ldr             w6,  [x4]                       // four tc0 bytes
        ccmp            w3,  #0,  #0,  ne               // alpha != 0 ? cmp beta, 0 : nzcv = 0
        lsl             w2,  w2,  #2                    // scale alpha by 4
        mov             v24.s[0], w6
        lsl             w3,  w3,  #2                    // scale beta by 4
        and             w8,  w6,  w6,  lsl #16          // fold sign bits of bytes 1/3 and 0/2
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8           // N = AND of all four sign bits
        b.ge            2f                              // at least one tc0 >= 0: go on
1:
        ret
2:
.endm
// h264_loop_filter_start_intra_10
//
// Common prologue of the 10-bit intra filters.
//   In:  w2 = alpha, w3 = beta
//   Out: w2 = alpha << 2, w3 = beta << 2
//        v30 = scaled alpha and v31 = scaled beta in every 16-bit lane
//   Clobbers: w4
// Executes `ret` (leaving the enclosing function) when alpha and beta are
// both zero.
.macro  h264_loop_filter_start_intra_10
        orr             w4,  w2,  w3
        cbnz            w4,  1f
        ret
1:
        lsl             w2,  w2,  #2
        lsl             w3,  w3,  #2
        dup             v30.8h,  w2                     // alpha
        dup             v31.8h,  w3                     // beta
.endm
// h264_loop_filter_chroma_10
//
// Normal (non-intra) chroma edge filter on 8 pixels at once, 16-bit
// samples clipped to the 10-bit range.
//   In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1 (8 halfwords each)
//        v24.s[0] = four tc0 bytes
//        w2 = alpha, w3 = beta (already scaled by the start macro)
//   Out: v16 = p0', v0 = q0', both clipped to [0, 1023]
//   Clobbers: x8, x9, flags, v4, v5, v22-v26, v28, v30, v31
//
// Each tc0 byte is duplicated so that it applies to two adjacent pixels.
// Branches to local label `9` (which the caller must define after the
// macro) when no pixel passes the alpha/beta tests.
.macro  h264_loop_filter_chroma_10
        dup             v22.8h,  w2                     // alpha
        dup             v23.8h,  w3                     // beta
        uxtl            v24.8h,  v24.8b                 // tc0

        uabd            v26.8h,  v16.8h,  v0.8h         // abs(p0 - q0)
        uabd            v28.8h,  v18.8h,  v16.8h        // abs(p1 - p0)
        uabd            v30.8h,  v2.8h,   v0.8h         // abs(q1 - q0)
        cmhi            v26.8h,  v22.8h,  v26.8h        // < alpha
        cmhi            v28.8h,  v23.8h,  v28.8h        // < beta
        cmhi            v30.8h,  v23.8h,  v30.8h        // < beta

        and             v26.16b, v26.16b, v28.16b
        mov             v4.16b,  v0.16b
        sub             v4.8h,   v4.8h,   v16.8h        // q0 - p0
        and             v26.16b, v26.16b, v30.16b       // v26 = per-pixel filter mask
        shl             v4.8h,   v4.8h,   #2
        mov             x8,  v26.d[0]
        mov             x9,  v26.d[1]
        sli             v24.8h,  v24.8h,  #8            // duplicate each tc0 byte
        uxtl            v24.8h,  v24.8b                 // one tc0 value per 16-bit pixel
        add             v4.8h,   v4.8h,   v18.8h
        adds            x8,  x8,  x9                    // Z set only if both mask halves are 0
        shl             v24.8h,  v24.8h,  #2

        b.eq            9f                              // nothing to filter

        movi            v31.8h,  #3                     // ((tc0 - 1) << (BIT_DEPTH - 8)) + 1
        uqsub           v24.8h,  v24.8h,  v31.8h        //   == (tc0 << 2) - 3, floored at 0
        sub             v4.8h,   v4.8h,   v2.8h
        srshr           v4.8h,   v4.8h,   #3            // (((q0 - p0) << 2) + p1 - q1 + 4) >> 3
        smin            v4.8h,   v4.8h,   v24.8h
        neg             v25.8h,  v24.8h
        smax            v4.8h,   v4.8h,   v25.8h        // delta clamped to [-tc, tc]
        and             v4.16b,  v4.16b,  v26.16b       // zero delta where the mask is clear
        add             v16.8h,  v16.8h,  v4.8h         // p0 + delta
        sub             v0.8h,   v0.8h,   v4.8h         // q0 - delta

        mvni            v4.8h,   #0xFC, lsl #8          // 1023 for clipping
        movi            v5.8h,   #0
        smin            v0.8h,   v0.8h,   v4.8h
        smin            v16.8h,  v16.8h,  v4.8h
        smax            v0.8h,   v0.8h,   v5.8h
        smax            v16.8h,  v16.8h,  v5.8h
.endm
// ff_h264_v_loop_filter_chroma_neon_10
//
// Normal chroma filter across a horizontal edge, 8 pixels wide, 16-bit
// samples.
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes
// Two pointers are used so the p and q rows load independently: x0 walks
// p1, p0 and x10 walks q0, q1. Rewrites rows p0 and q0.
function ff_h264_v_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10

        mov             x10, x0
        sub             x0,  x0,  x1, lsl #1            // two rows above the edge
        ld1             {v18.8h}, [x0],  x1             // p1
        ld1             {v0.8h},  [x10], x1             // q0
        ld1             {v16.8h}, [x0],  x1             // p0
        ld1             {v2.8h},  [x10]                 // q1

        h264_loop_filter_chroma_10

        sub             x0,  x10, x1, lsl #1            // x10 is at q1, so this is the p0 row
        st1             {v16.8h}, [x0], x1              // p0'
        st1             {v0.8h},  [x0], x1              // q0'
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma_neon_10
//
// Normal chroma filter across a vertical edge, 8 rows tall, 16-bit samples.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes
//
// The inner label h_loop_filter_chroma420_10 is also entered from
// ff_h264_h_loop_filter_chroma422_neon_10. At that label x0 must already
// point 4 bytes (two samples) left of the edge and v24.s[0] must hold the
// tc0 bytes. x0 walks rows 0-3 while x10 walks rows 4-7.
function ff_h264_h_loop_filter_chroma_neon_10, export=1
        h264_loop_filter_start_10

        sub             x0,  x0,  #4                    // access the 2nd left pixel
h_loop_filter_chroma420_10:
        add             x10, x0,  x1, lsl #2            // four rows further down
        ld1             {v18.d}[0], [x0],  x1
        ld1             {v18.d}[1], [x10], x1
        ld1             {v16.d}[0], [x0],  x1
        ld1             {v16.d}[1], [x10], x1
        ld1             {v0.d}[0],  [x0],  x1
        ld1             {v0.d}[1],  [x10], x1
        ld1             {v2.d}[0],  [x0],  x1
        ld1             {v2.d}[1],  [x10], x1

        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma_10

        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x10, x1, lsl #3            // x10 is 8 rows past the start
        st1             {v18.d}[0], [x0], x1
        st1             {v16.d}[0], [x0], x1
        st1             {v0.d}[0],  [x0], x1
        st1             {v2.d}[0],  [x0], x1
        st1             {v18.d}[1], [x0], x1
        st1             {v16.d}[1], [x0], x1
        st1             {v0.d}[1],  [x0], x1
        st1             {v2.d}[1],  [x0], x1
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma422_neon_10
//
// Normal chroma filter across a vertical edge for 4:2:2 content, 16-bit
// samples.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta, x4 = pointer to four tc0 bytes
//
// Runs the 8-row 4:2:0 body twice with a doubled stride: first on the rows
// starting at x0, then on the rows starting one original stride further
// down. The link register is parked in x7 across the `bl`; the second pass
// is a tail call. v24.s[0] is reloaded from w6 because the filter macro
// overwrites v24.
function ff_h264_h_loop_filter_chroma422_neon_10, export=1
        h264_loop_filter_start_10
        add             x5,  x0,  x1                    // start of the second pass
        sub             x0,  x0,  #4
        add             x1,  x1,  x1                    // stride *= 2
        mov             x7,  x30                        // save the return address
        bl              h_loop_filter_chroma420_10

        mov             x30, x7
        sub             x0,  x5,  #4
        mov             v24.s[0], w6                    // restore the tc0 bytes
        b               h_loop_filter_chroma420_10
endfunc
// h264_loop_filter_chroma_intra_10
//
// Intra chroma edge filter on 8 pixels at once, 16-bit samples.
//   In:  v18 = p1, v16 = p0, v17 = q0, v19 = q1 (8 halfwords each)
//        v30 = alpha, v31 = beta (replicated per halfword)
//   Out: v16 = p0', v17 = q0' where the conditions hold
//   Clobbers: x2, x3, flags, v4, v6, v20, v22, v24-v28
//
// Branches to local label `9` (which the caller must define after the
// macro) when no pixel passes the alpha/beta tests.
.macro  h264_loop_filter_chroma_intra_10
        uabd            v26.8h,  v16.8h,  v17.8h        // abs(p0 - q0)
        uabd            v27.8h,  v18.8h,  v16.8h        // abs(p1 - p0)
        uabd            v28.8h,  v19.8h,  v17.8h        // abs(q1 - q0)
        cmhi            v26.8h,  v30.8h,  v26.8h        // < alpha
        cmhi            v27.8h,  v31.8h,  v27.8h        // < beta
        cmhi            v28.8h,  v31.8h,  v28.8h        // < beta
        and             v26.16b, v26.16b, v27.16b
        and             v26.16b, v26.16b, v28.16b       // v26 = per-pixel filter mask
        mov             x2,  v26.d[0]
        mov             x3,  v26.d[1]

        shl             v4.8h,   v18.8h,  #1            // 2 * p1
        shl             v6.8h,   v19.8h,  #1            // 2 * q1

        adds            x2,  x2,  x3                    // Z set only if both mask halves are 0
        b.eq            9f                              // nothing to filter

        add             v20.8h,  v16.8h,  v19.8h        // p0 + q1
        add             v22.8h,  v17.8h,  v18.8h        // q0 + p1
        add             v20.8h,  v20.8h,  v4.8h
        add             v22.8h,  v22.8h,  v6.8h
        urshr           v24.8h,  v20.8h,  #2            // (2*p1 + p0 + q1 + 2) >> 2
        urshr           v25.8h,  v22.8h,  #2            // (2*q1 + q0 + p1 + 2) >> 2
        bit             v16.16b, v24.16b, v26.16b
        bit             v17.16b, v25.16b, v26.16b
.endm
// ff_h264_v_loop_filter_chroma_intra_neon_10
//
// Intra chroma filter across a horizontal edge, 8 pixels wide, 16-bit
// samples.
//   x0 = address of the q0 row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
// x0 walks p1, p0 and x9 walks q0, q1. Rewrites rows p0 and q0.
function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10

        mov             x9,  x0
        sub             x0,  x0,  x1, lsl #1            // two rows above the edge
        ld1             {v18.8h}, [x0], x1              // p1
        ld1             {v17.8h}, [x9], x1              // q0
        ld1             {v16.8h}, [x0], x1              // p0
        ld1             {v19.8h}, [x9]                  // q1

        h264_loop_filter_chroma_intra_10

        sub             x0,  x9,  x1, lsl #1            // x9 is at q1, so this is the p0 row
        st1             {v16.8h}, [x0], x1              // p0'
        st1             {v17.8h}, [x0], x1              // q0'
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10
//
// Intra chroma filter across a vertical edge, 4 rows tall (MBAFF variant),
// 16-bit samples.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
// x4 loads rows 0 and 1, x9 loads rows 2 and 3, both starting 4 bytes (two
// samples) left of the edge. x0 is the store pointer, 2 bytes (one sample)
// left of the edge, where the (p0', q0') pair of each row is stored.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
        h264_loop_filter_start_intra_10

        sub             x4,  x0,  #4                    // load pointer, rows 0-1
        sub             x0,  x0,  #2                    // store pointer
        add             x9,  x4,  x1, lsl #1            // load pointer, rows 2-3
        ld1             {v18.8h}, [x4], x1              // row 0
        ld1             {v17.8h}, [x9], x1              // row 2
        ld1             {v16.8h}, [x4], x1              // row 1
        ld1             {v19.8h}, [x9], x1              // row 3

        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10

        st2             {v16.h,v17.h}[0], [x0], x1
        st2             {v16.h,v17.h}[1], [x0], x1
        st2             {v16.h,v17.h}[2], [x0], x1
        st2             {v16.h,v17.h}[3], [x0], x1
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma_intra_neon_10
//
// Intra chroma filter across a vertical edge, 8 rows tall, 16-bit samples.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
//
// The inner label h_loop_filter_chroma420_intra_10 is also entered from
// ff_h264_h_loop_filter_chroma422_intra_neon_10. At that label x4 must be
// the load pointer (4 bytes left of the edge), x0 the store pointer
// (2 bytes left of the edge) and v30/v31 must hold alpha/beta.
// x4 walks rows 0-3 while x9 walks rows 4-7; on exit x9 points at row 8.
function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
        h264_loop_filter_start_intra_10

        sub             x4,  x0,  #4                    // load pointer
        sub             x0,  x0,  #2                    // store pointer
h_loop_filter_chroma420_intra_10:
        add             x9,  x4,  x1, lsl #2            // four rows further down
        ld1             {v18.4h},   [x4], x1
        ld1             {v18.d}[1], [x9], x1
        ld1             {v16.4h},   [x4], x1
        ld1             {v16.d}[1], [x9], x1
        ld1             {v17.4h},   [x4], x1
        ld1             {v17.d}[1], [x9], x1
        ld1             {v19.4h},   [x4], x1
        ld1             {v19.d}[1], [x9], x1

        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra_10

        st2             {v16.h,v17.h}[0], [x0], x1
        st2             {v16.h,v17.h}[1], [x0], x1
        st2             {v16.h,v17.h}[2], [x0], x1
        st2             {v16.h,v17.h}[3], [x0], x1
        st2             {v16.h,v17.h}[4], [x0], x1
        st2             {v16.h,v17.h}[5], [x0], x1
        st2             {v16.h,v17.h}[6], [x0], x1
        st2             {v16.h,v17.h}[7], [x0], x1
9:
        ret
endfunc
// ff_h264_h_loop_filter_chroma422_intra_neon_10
//
// Intra chroma filter across a vertical edge for 4:2:2 content (16 rows),
// 16-bit samples.
//   x0 = address of q0 in the first row, x1 = row stride in bytes
//   w2 = alpha, w3 = beta
//
// Runs the 8-row 4:2:0 intra body twice: on rows 0-7, then on rows 8-15.
// After the first pass x9 points at row 8 of the load column, so it becomes
// the load pointer x4 of the second pass. The link register is parked in x7
// across the `bl`; the second pass is a tail call.
function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
        h264_loop_filter_start_intra_10
        sub             x4,  x0,  #4                    // load pointer
        add             x5,  x0,  x1, lsl #3            // q0 of row 8
        sub             x0,  x0,  #2                    // store pointer
        mov             x7,  x30                        // save the return address
        bl              h_loop_filter_chroma420_intra_10
        mov             x4,  x9                         // load pointer for rows 8-15
        sub             x0,  x5,  #2                    // store pointer for rows 8-15
        mov             x30, x7
        b               h_loop_filter_chroma420_intra_10
endfunc
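// NOTE: the following lines are residue from the web page this listing was
// extracted from; they are not part of the assembly source.
//   Measurement V0.5 in percent: C=94 H=88 G=90
//   Processing time: 0.21 seconds (preprocessed on 2026-06-05)
//   (c) Formatika GbR, Germany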