/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
/*
 * ff_h264_idct_add_neon
 * Presumed C prototype (inferred from register use, confirm against the
 * C declaration): void f(uint8_t *dst, int16_t *block, int stride)
 *
 * Runs two 1-D butterfly passes (with a 4x4 transpose in between) over the
 * 16 int16 coefficients at block, rounds the result with a shift by 6,
 * adds it to the 4x4 pixels at dst and stores the sum with unsigned
 * saturation. The 32 bytes at block are overwritten with zeros.
 *
 * In:    x0 = dst, x1 = block, w2 = stride (sign-extended here)
 * Out:   x0 = dst + 4 * stride, x1 = block (restored), x2 = stride
 * Clobb: v0-v7, v16-v19, v30 (transpose_4x4H is defined elsewhere; it is
 *        handed v4-v7 as extra operands, presumably as temporaries)
 *
 * .L_ff_h264_idct_add_neon is a file-local entry point that the block-loop
 * functions in this file call through blr.
 */
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        AARCH64_VALID_CALL_TARGET
        ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]
        sxtw            x2,  w2
        movi            v30.8h, #0

        // first pass; the two stores clear the coefficient block (2 x 16 bytes)
        add             v4.4h,  v0.4h,  v2.4h
        sshr            v16.4h, v1.4h,  #1
        st1             {v30.8h}, [x1], #16
        sshr            v17.4h, v3.4h,  #1
        st1             {v30.8h}, [x1], #16
        sub             v5.4h,  v0.4h,  v2.4h
        sub             v6.4h,  v16.4h, v3.4h
        add             v7.4h,  v1.4h,  v17.4h
        add             v0.4h,  v4.4h,  v7.4h
        add             v1.4h,  v5.4h,  v6.4h
        sub             v2.4h,  v5.4h,  v6.4h
        sub             v3.4h,  v4.4h,  v7.4h

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // second pass, interleaved with the loads of the four pixel rows
        add             v4.4h,  v0.4h,  v2.4h
        ld1             {v18.s}[0], [x0], x2    // pixel row 0
        sshr            v16.4h, v3.4h,  #1
        sshr            v17.4h, v1.4h,  #1
        ld1             {v18.s}[1], [x0], x2    // pixel row 1
        sub             v5.4h,  v0.4h,  v2.4h
        ld1             {v19.s}[1], [x0], x2    // pixel row 2
        add             v6.4h,  v16.4h, v1.4h
        ins             v4.d[1], v5.d[0]
        sub             v7.4h,  v17.4h, v3.4h
        ld1             {v19.s}[0], [x0], x2    // pixel row 3
        ins             v6.d[1], v7.d[0]
        sub             x0,  x0,  x2, lsl #2    // back to the first row
        add             v0.8h,  v4.8h,  v6.8h   // low half: row 0, high half: row 1
        sub             v1.8h,  v4.8h,  v6.8h   // low half: row 3, high half: row 2

        srshr           v0.8h,  v0.8h,  #6
        srshr           v1.8h,  v1.8h,  #6

        uaddw           v0.8h,  v0.8h,  v18.8b
        uaddw           v1.8h,  v1.8h,  v19.8b

        sqxtun          v0.8b,  v0.8h
        sqxtun          v1.8b,  v1.8h

        // v1 holds rows 3 and 2 in that order, hence the swapped lanes
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2

        sub             x1,  x1,  #32           // undo the two post-increments
        ret
endfunc
/*
 * ff_h264_idct_dc_add_neon
 * Presumed C prototype (inferred from register use, confirm against the
 * C declaration): void f(uint8_t *dst, int16_t *block, int stride)
 *
 * Takes the single int16 at block[0], rounds it with a shift by 6, adds it
 * to every pixel of the 4x4 area at dst and stores the sums with unsigned
 * saturation. block[0] is overwritten with zero.
 *
 * In:    x0 = dst, x1 = block, w2 = stride (sign-extended here)
 * Out:   x0 = dst + 4 * stride, x1 unchanged, x2 = stride
 * Clobb: w3, v0-v4
 *
 * .L_ff_h264_idct_dc_add_neon is a file-local entry point that the
 * block-loop functions in this file call through blr.
 */
function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        sxtw            x2,  w2
        mov             w3,  #0
        ld1r            {v2.8h}, [x1]           // broadcast block[0]
        strh            w3,  [x1]               // block[0] = 0
        srshr           v2.8h,  v2.8h,  #6
        ld1             {v0.s}[0], [x0], x2     // rows 0 and 1
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0], [x0], x2     // rows 2 and 3
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2, lsl #2    // back to the first row
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2
        ret
endfunc
/*
 * ff_h264_idct_add16_neon
 *
 * In:  x0 = dest, x1 = block_offset (32-bit signed entries), x2 = block,
 *      w3 = stride, x4 = byte table indexed with the values from scan8
 *
 * For i = 0..15, with n = x4[scan8[i]]:
 *   n == 0                        -> block i is skipped
 *   n == 1 and block[i*16] != 0   -> .L_ff_h264_idct_dc_add_neon
 *   otherwise                     -> .L_ff_h264_idct_add_neon
 * Each call gets x0 = dest + block_offset[i], x1 = block + i*32 bytes and
 * w2 = stride.
 *
 * Register roles inside the loop:
 *   x12 = saved return address      x6  = dest
 *   x5  = block_offset cursor       x1  = coefficient cursor
 *   w9  = stride                    x7  = scan8 cursor
 *   x10 = blocks left               x13 / x14 = dc / full callee address
 * The loop depends on both callees returning with x1 unchanged and leaving
 * x4-x7, x9, x10 and x12-x14 alone.
 */
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30                // blr overwrites x30
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w9,  w3                 // stride
        movrel          x7,  scan8
        mov             x10, #16
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1           // scan8[i]
        ldrsw           x0,  [x5], #4           // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // n = x4[scan8[i]]
        subs            w3,  w3,  #1
        b.lt            2f                      // n == 0: nothing to do
        ldrsh           w3,  [x1]               // block[i*16]
        add             x0,  x0,  x6
        // flags still come from the subs: eq means n == 1. In that case
        // compare block[i*16] with 0, otherwise force Z so that ne fails.
        ccmp            w3,  #0,  #4,  eq
        csel            x15, x13, x14, ne
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next 16 coefficients
        b.ne            1b
        ret             x12
endfunc
/*
 * ff_h264_idct_add16intra_neon
 *
 * In:  x0 = dest, x1 = block_offset (32-bit signed entries), x2 = block,
 *      w3 = stride, x4 = byte table indexed with the values from scan8
 *
 * For i = 0..15, with n = x4[scan8[i]]:
 *   n != 0                        -> .L_ff_h264_idct_add_neon
 *   n == 0 and block[i*16] != 0   -> .L_ff_h264_idct_dc_add_neon
 *   n == 0 and block[i*16] == 0   -> block i is skipped
 * Each call gets x0 = dest + block_offset[i], x1 = block + i*32 bytes and
 * w2 = stride.
 *
 * Register roles inside the loop:
 *   x12 = saved return address      x6  = dest
 *   x5  = block_offset cursor       x1  = coefficient cursor
 *   w9  = stride                    x7  = scan8 cursor
 *   x10 = blocks left               x13 / x14 = dc / full callee address
 * The loop depends on both callees returning with x1 unchanged and leaving
 * x4-x7, x9, x10 and x12-x14 alone.
 */
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30                // blr overwrites x30
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w9,  w3                 // stride
        movrel          x7,  scan8
        mov             x10, #16
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1           // scan8[i]
        ldrsw           x0,  [x5], #4           // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // n = x4[scan8[i]]
        add             x0,  x0,  x6
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[i*16]
        csel            x15, x13, x14, eq       // n == 0 ? dc : full
        // n == 0: compare block[i*16] with 0; n != 0: clear Z so the
        // branch below is not taken
        ccmp            w3,  #0,  #0,  eq
        b.eq            2f
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next 16 coefficients
        b.ne            1b
        ret             x12
endfunc
/*
 * ff_h264_idct_add8_neon
 *
 * In:  x0 = pointer to two destination pointers (dest[0], dest[1]),
 *      x1 = block_offset (32-bit signed entries), x2 = block,
 *      w3 = stride, x4 = byte table indexed with the values from scan8
 *
 * Handles block indices 16..19 with dest[0] and 32..35 with dest[1].
 * x10 counts 0..3, is then bumped to 16 and counts 16..19; all three
 * tables are pre-biased by 16 entries so x10 indexes them directly.
 * Per block, with n = x4[scan8[i]]:
 *   n != 0                        -> .L_ff_h264_idct_add_neon
 *   n == 0 and block[i*16] != 0   -> .L_ff_h264_idct_dc_add_neon
 *   n == 0 and block[i*16] == 0   -> skipped
 *
 * x19 and x20 are callee-saved and therefore spilled. The frame is 0x40
 * bytes although only the first 16 are written; sp stays 16-byte aligned.
 */
function ff_h264_idct_add8_neon, export=1
        stp             x19, x20, [sp, #-0x40]!
        mov             x12, x30                // blr overwrites x30
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // block_offset
        add             x9,  x2,  #16*32        // block
        mov             w19, w3                 // stride
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8, 16
        mov             x10, #0
        mov             x11, #16
1:      mov             w2,  w19
        ldrb            w3,  [x7, x10]          // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5   // block + i * 16
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[i*16]
        csel            x20, x13, x14, eq       // nnzc == 0 ? dc : full
        // nnzc == 0: compare block[i*16] with 0; otherwise clear Z so the
        // branch below is not taken
        ccmp            w3,  #0,  #0,  eq
        b.eq            2f
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq       // mov x10, #16
        csel            x6,  x15, x6,  eq       // switch to dest[1]
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp], #0x40
        ret             x12
endfunc
/*
 * idct8x8_cols pass
 *
 * One 1-D pass of the 8x8 transform over eight vectors of eight int16.
 *
 * pass == 0
 *   in:  v24-v29 hold vectors 0-5; vectors 6 and 7 are loaded here from
 *        [x1] into v30/v31. Those 32 bytes are then overwritten from v19,
 *        so v19 must be all zero on entry, and x1 advances by 32.
 *   out: v24, v25, v26, v27, v28, v29, v18, v31 (vector 6 ends up in v18)
 * pass != 0
 *   in:  v24, v25, v26, v27, v28, v29, v18, v31
 *   out: v24-v31
 *
 * Scratch: v16, v17, v19, plus v30 (pass 0) or v18 (other passes).
 * va and vb are register aliases that exist only inside the macro.
 */
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8h, v26.8h, #1
        add             v16.8h, v24.8h, v28.8h
        ld1             {v30.8h, v31.8h}, [x1]
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16
        sub             v17.8h, v24.8h, v28.8h
        sshr            v19.8h, v30.8h, #1
        sub             v18.8h, v18.8h, v30.8h
        add             v19.8h, v19.8h, v26.8h
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8h, v26.8h, #1
        sshr            v19.8h, v18.8h, #1
        add             v16.8h, v24.8h, v28.8h
        sub             v17.8h, v24.8h, v28.8h
        sub             v30.8h, v30.8h, v18.8h
        add             v19.8h, v19.8h, v26.8h
  .endif
        add             v26.8h, v17.8h, va.8h
        sub             v28.8h, v17.8h, va.8h
        add             v24.8h, v16.8h, v19.8h
        sub             vb.8h,  v16.8h, v19.8h
        sub             v16.8h, v29.8h, v27.8h
        add             v17.8h, v31.8h, v25.8h
        sub             va.8h,  v31.8h, v25.8h
        add             v19.8h, v29.8h, v27.8h
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v25.8h, #1
        sshr            v27.8h, v27.8h, #1
        sshr            v29.8h, v29.8h, #1
        sshr            v31.8h, v31.8h, #1
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v16.8h, #2
        sshr            v27.8h, v17.8h, #2
        sshr            v29.8h, va.8h,  #2
        sshr            v31.8h, v19.8h, #2
        sub             v19.8h, v19.8h, v25.8h
        sub             va.8h,  v27.8h, va.8h
        add             v17.8h, v17.8h, v29.8h
        add             v16.8h, v16.8h, v31.8h
  .if \pass == 0
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v18.8h
        sub             v18.8h, v26.8h, v18.8h
        add             v26.8h, v28.8h, v17.8h
        add             v27.8h, v30.8h, v16.8h
        sub             v29.8h, v28.8h, v17.8h
        sub             v28.8h, v30.8h, v16.8h
  .else
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v30.8h
        sub             v30.8h, v26.8h, v30.8h
        add             v26.8h, v28.8h, v17.8h
        sub             v29.8h, v28.8h, v17.8h
        add             v27.8h, v18.8h, v16.8h
        sub             v28.8h, v18.8h, v16.8h
  .endif
        .unreq          va
        .unreq          vb
.endm
/*
 * ff_h264_idct8_add_neon
 * Presumed C prototype (inferred from register use, confirm against the
 * C declaration): void f(uint8_t *dst, int16_t *block, int stride)
 *
 * Runs idct8x8_cols twice (with an 8x8 transpose in between) over the
 * 64 int16 coefficients at block, rounds the result with a shift by 6,
 * adds it to the 8x8 pixels at dst and stores the sums with unsigned
 * saturation. The 128 bytes at block are overwritten with zeros: 96 here,
 * the last 32 inside the first idct8x8_cols invocation.
 *
 * In:    x0 = dst, x1 = block, w2 = stride (sign-extended here)
 * Out:   x0 = x3 = dst + 8 * stride, x1 = block (restored), x2 = stride
 * Clobb: x3, v0-v7, v16-v19, v24-v31 (v6/v7 are handed to transpose_8x8H,
 *        which is defined elsewhere, presumably as temporaries)
 *
 * .L_ff_h264_idct8_add_neon is a file-local entry point that
 * ff_h264_idct8_add4_neon calls through blr.
 */
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        AARCH64_VALID_CALL_TARGET
        movi            v19.8h, #0
        sxtw            x2,  w2
        // load vectors 0-5 and clear them in memory as we go
        ld1             {v24.8h, v25.8h}, [x1]
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16
        ld1             {v26.8h, v27.8h}, [x1]
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16
        ld1             {v28.8h, v29.8h}, [x1]
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16

        idct8x8_cols    0
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        mov             x3,  x0                 // x0 reads rows, x3 writes them
        srshr           v24.8h, v24.8h, #6
        ld1             {v0.8b}, [x0], x2
        srshr           v25.8h, v25.8h, #6
        ld1             {v1.8b}, [x0], x2
        srshr           v26.8h, v26.8h, #6
        ld1             {v2.8b}, [x0], x2
        srshr           v27.8h, v27.8h, #6
        ld1             {v3.8b}, [x0], x2
        srshr           v28.8h, v28.8h, #6
        ld1             {v4.8b}, [x0], x2
        srshr           v29.8h, v29.8h, #6
        ld1             {v5.8b}, [x0], x2
        srshr           v30.8h, v30.8h, #6
        ld1             {v6.8b}, [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v7.8b}, [x0], x2
        uaddw           v24.8h, v24.8h, v0.8b
        uaddw           v25.8h, v25.8h, v1.8b
        uaddw           v26.8h, v26.8h, v2.8b
        sqxtun          v0.8b,  v24.8h
        uaddw           v27.8h, v27.8h, v3.8b
        sqxtun          v1.8b,  v25.8h
        uaddw           v28.8h, v28.8h, v4.8b
        sqxtun          v2.8b,  v26.8h
        st1             {v0.8b}, [x3], x2
        uaddw           v29.8h, v29.8h, v5.8b
        sqxtun          v3.8b,  v27.8h
        st1             {v1.8b}, [x3], x2
        uaddw           v30.8h, v30.8h, v6.8b
        sqxtun          v4.8b,  v28.8h
        st1             {v2.8b}, [x3], x2
        uaddw           v31.8h, v31.8h, v7.8b
        sqxtun          v5.8b,  v29.8h
        st1             {v3.8b}, [x3], x2
        sqxtun          v6.8b,  v30.8h
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b}, [x3], x2
        st1             {v5.8b}, [x3], x2
        st1             {v6.8b}, [x3], x2
        st1             {v7.8b}, [x3], x2

        sub             x1,  x1,  #128          // undo the eight post-increments
        ret
endfunc
/*
 * ff_h264_idct8_dc_add_neon
 * Presumed C prototype (inferred from register use, confirm against the
 * C declaration): void f(uint8_t *dst, int16_t *block, int stride)
 *
 * Takes the single int16 at block[0], rounds it with a shift by 6, adds it
 * to every pixel of the 8x8 area at dst and stores the sums with unsigned
 * saturation. block[0] is overwritten with zero.
 *
 * In:    x0 = dst, x1 = block, w2 = stride (sign-extended here)
 * Out:   x0 = dst + 8 * stride, x1 unchanged, x2 = stride
 * Clobb: w3, v0-v7, v24-v31
 *
 * .L_ff_h264_idct8_dc_add_neon is a file-local entry point that
 * ff_h264_idct8_add4_neon calls through blr.
 */
function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        mov             w3,  #0
        sxtw            x2,  w2
        ld1r            {v31.8h}, [x1]          // broadcast block[0]
        strh            w3,  [x1]               // block[0] = 0
        ld1             {v0.8b}, [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v1.8b}, [x0], x2
        ld1             {v2.8b}, [x0], x2
        uaddw           v24.8h, v31.8h, v0.8b
        ld1             {v3.8b}, [x0], x2
        uaddw           v25.8h, v31.8h, v1.8b
        ld1             {v4.8b}, [x0], x2
        uaddw           v26.8h, v31.8h, v2.8b
        ld1             {v5.8b}, [x0], x2
        uaddw           v27.8h, v31.8h, v3.8b
        ld1             {v6.8b}, [x0], x2
        uaddw           v28.8h, v31.8h, v4.8b
        ld1             {v7.8b}, [x0], x2
        uaddw           v29.8h, v31.8h, v5.8b
        uaddw           v30.8h, v31.8h, v6.8b
        uaddw           v31.8h, v31.8h, v7.8b   // last use of the dc value
        sqxtun          v0.8b,  v24.8h
        sqxtun          v1.8b,  v25.8h
        sqxtun          v2.8b,  v26.8h
        sqxtun          v3.8b,  v27.8h
        sub             x0,  x0,  x2, lsl #3    // back to the first row
        st1             {v0.8b}, [x0], x2
        sqxtun          v4.8b,  v28.8h
        st1             {v1.8b}, [x0], x2
        sqxtun          v5.8b,  v29.8h
        st1             {v2.8b}, [x0], x2
        sqxtun          v6.8b,  v30.8h
        st1             {v3.8b}, [x0], x2
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b}, [x0], x2
        st1             {v5.8b}, [x0], x2
        st1             {v6.8b}, [x0], x2
        st1             {v7.8b}, [x0], x2
        ret
endfunc
/*
 * ff_h264_idct8_add4_neon
 *
 * In:  x0 = dest, x1 = block_offset (32-bit signed entries), x2 = block,
 *      w3 = stride, x4 = byte table indexed with the values from scan8
 *
 * For i = 0, 4, 8, 12, with n = x4[scan8[i]]:
 *   n == 0                        -> block is skipped
 *   n == 1 and block[i*16] != 0   -> .L_ff_h264_idct8_dc_add_neon
 *   otherwise                     -> .L_ff_h264_idct8_add_neon
 * Each call gets x0 = dest + block_offset[i], x1 = block + i*32 bytes and
 * w2 = stride. w2 is set once: the callees only sign-extend it into x2.
 *
 * Register roles inside the loop:
 *   x12 = saved return address      x6  = dest
 *   x5  = block_offset cursor       x1  = coefficient cursor
 *   x7  = scan8 cursor              w10 = remaining count, steps of 4
 *   x13 / x14 = dc / full callee address
 * The loop depends on both callees returning with x1 unchanged and leaving
 * x2, x4-x7, x10 and x12-x14 alone.
 */
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30                // blr overwrites x30
        mov             x6,  x0
        mov             x5,  x1
        mov             x1,  x2
        mov             w2,  w3
        movrel          x7,  scan8
        mov             w10, #16
        movrel          x13, .L_ff_h264_idct8_dc_add_neon
        movrel          x14, .L_ff_h264_idct8_add_neon
1:      ldrb            w9,  [x7], #4           // scan8[i], i += 4
        ldrsw           x0,  [x5], #16          // block_offset[i]
        ldrb            w9,  [x4, w9, uxtw]     // n = x4[scan8[i]]
        subs            w9,  w9,  #1
        b.lt            2f                      // n == 0: nothing to do
        ldrsh           w11, [x1]               // block[i*16]
        add             x0,  x6,  x0
        // flags still come from the subs: eq means n == 1. In that case
        // compare block[i*16] with 0, otherwise force Z so that ne fails.
        ccmp            w11, #0,  #4,  eq
        csel            x15, x13, x14, ne
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128          // next 64 coefficients
        b.ne            1b
        ret             x12
endfunc
/*
 * scan8: 48-byte lookup table. Every entry has the form x + y*8 and is
 * used by the block-loop functions in this file as an index into the byte
 * table passed in x4. Entries come in groups of four that cover a 2x2
 * patch: (x, y), (x+1, y), (x, y+1), (x+1, y+1).
 */
.macro  scan8_quad      x, y
        .byte           \x + \y*8, \x+1 + \y*8, \x + (\y+1)*8, \x+1 + (\y+1)*8
.endm

const   scan8
        // entries 0-15
        scan8_quad      4, 1
        scan8_quad      6, 1
        scan8_quad      4, 3
        scan8_quad      6, 3
        // entries 16-31
        scan8_quad      4, 6
        scan8_quad      6, 6
        scan8_quad      4, 8
        scan8_quad      6, 8
        // entries 32-47
        scan8_quad      4, 11
        scan8_quad      6, 11
        scan8_quad      4, 13
        scan8_quad      6, 13
endconst
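/*
 * The following text is not part of the assembly source; it is a web-page
 * footer that was captured together with the file. It is kept here as a
 * comment so that it cannot be parsed as code.
 *
 * Measurement V0.5 in percent: C=99 H=100 G=99
 * Processing time: 0.13 seconds (preprocessed on 2026-06-05)
 * © Formatika GbR, Germany
 */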