/*
* Copyright © 2018 , VideoLAN and dav1d authors
* Copyright © 2019 , Martin Storsjo
* All rights reserved .
*
* Redistribution and use in source and binary forms , with or without
* modification , are permitted provided that the following conditions are met :
*
* 1 . Redistributions of source code must retain the above copyright notice , this
* list of conditions and the following disclaimer .
*
* 2 . Redistributions in binary form must reproduce the above copyright notice ,
* this list of conditions and the following disclaimer in the documentation
* and / or other materials provided with the distribution .
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS " AS IS " AND
* ANY EXPRESS OR IMPLIED WARRANTIES , INCLUDING , BUT NOT LIMITED TO , THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED . IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT , INDIRECT , INCIDENTAL , SPECIAL , EXEMPLARY , OR CONSEQUENTIAL DAMAGES
* ( INCLUDING , BUT NOT LIMITED TO , PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES ;
* LOSS OF USE , DATA , OR PROFITS ; OR BUSINESS INTERRUPTION ) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY , WHETHER IN CONTRACT , STRICT LIABILITY , OR TORT
* ( INCLUDING NEGLIGENCE OR OTHERWISE ) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE , EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE .
*/
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_sgr_box3_row_v_neon(int32_t **sumsq, int16_t **sum,
//                                int32_t *sumsq_out, int16_t *sum_out,
//                                const int w);
//
// Element-wise sum of three rows: for each column, the three 32-bit sumsq
// entries are added into sumsq_out and the three 16-bit sum entries are
// added into sum_out.
//
// In:   r0   = sumsq     (three row pointers are read from [r0], [r0, #4], [r0, #8])
//       r1   = sum       (three row pointers are read from [r1], [r1, #4], [r1, #8])
//       r2   = sumsq_out
//       r3   = sum_out
//       [sp] = w         (5th argument, on the stack)
//
// The loop handles 8 columns per iteration and runs while the remaining
// count, starting at w + 2, is still positive. The count is therefore
// effectively rounded up to a multiple of 8, and every buffer is accessed
// in whole 8-element groups.
// The row pointers are advanced in registers only; the pointer arrays
// behind r0/r1 are not written.
function sgr_box3_row_v_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]          // w; 7 registers (28 bytes) were pushed above
        ldrd            r6,  r7,  [r0]          // sumsq[0], sumsq[1]
        ldr             r0,  [r0, #8]           // sumsq[2] (r0 is reused as the row pointer)
        add             r4,  r4,  #2            // w += 2
        ldrd            r8,  r9,  [r1]          // sum[0], sum[1]
        ldr             r1,  [r1, #8]           // sum[2] (r1 is reused as the row pointer)

1:
        vld1.32         {q8,  q9},  [r6]!       // 8 x int32 from sumsq[0]
        vld1.32         {q10, q11}, [r7]!       // 8 x int32 from sumsq[1]
        vld1.16         {q14},      [r8]!       // 8 x int16 from sum[0]
        vld1.16         {q15},      [r9]!       // 8 x int16 from sum[1]
        subs            r4,  r4,  #8            // flags are consumed by the bgt below
        vadd.i32        q8,  q8,  q10
        vadd.i32        q9,  q9,  q11
        vld1.32         {q12, q13}, [r0]!       // 8 x int32 from sumsq[2]
        vadd.i16        q14, q14, q15
        vld1.16         {q15},      [r1]!       // 8 x int16 from sum[2]
        vadd.i32        q8,  q8,  q12
        vadd.i32        q9,  q9,  q13
        vadd.i16        q14, q14, q15
        vst1.32         {q8,  q9},  [r2]!       // sumsq_out
        vst1.16         {q14},      [r3]!       // sum_out
        bgt             1b
        pop             {r4-r9,pc}
endfunc
// void dav1d_sgr_box5_row_v_neon(int32_t **sumsq, int16_t **sum,
//                                int32_t *sumsq_out, int16_t *sum_out,
//                                const int w);
//
// Element-wise sum of five rows: for each column, the five 32-bit sumsq
// entries are added into sumsq_out and the five 16-bit sum entries are
// added into sum_out.
//
// In:   r0   = sumsq     (five row pointers are read from [r0] .. [r0, #16])
//       r1   = sum       (five row pointers are read from [r1] .. [r1, #16])
//       r2   = sumsq_out
//       r3   = sum_out
//       [sp] = w         (5th argument, on the stack)
//
// The loop handles 8 columns per iteration and runs while the remaining
// count, starting at w + 2, is still positive. The count is therefore
// effectively rounded up to a multiple of 8, and every buffer is accessed
// in whole 8-element groups.
// lr doubles as the loop counter; the return address is restored straight
// into pc by the final pop.
function sgr_box5_row_v_neon, export=1
        push            {r4-r11,lr}
        ldr             lr,  [sp, #36]          // w; 9 registers (36 bytes) were pushed above
        ldrd            r4,  r5,  [r0]          // sumsq[0], sumsq[1]
        ldrd            r6,  r7,  [r0, #8]      // sumsq[2], sumsq[3]
        ldr             r0,  [r0, #16]          // sumsq[4] (r0 is reused as the row pointer)
        add             lr,  lr,  #2            // w += 2
        ldrd            r8,  r9,  [r1]          // sum[0], sum[1]
        ldrd            r10, r11, [r1, #8]      // sum[2], sum[3]
        ldr             r1,  [r1, #16]          // sum[4] (r1 is reused as the row pointer)

1:
        vld1.32         {q8,  q9},  [r4]!       // 8 x int32 from sumsq[0]
        vld1.32         {q10, q11}, [r5]!       // 8 x int32 from sumsq[1]
        vld1.32         {q12, q13}, [r6]!       // 8 x int32 from sumsq[2]
        vld1.32         {q14, q15}, [r7]!       // 8 x int32 from sumsq[3]
        vld1.16         {q0},       [r8]!       // 8 x int16 from sum[0]
        vld1.16         {q1},       [r9]!       // 8 x int16 from sum[1]
        vld1.16         {q2},       [r10]!      // 8 x int16 from sum[2]
        vld1.16         {q3},       [r11]!      // 8 x int16 from sum[3]
        subs            lr,  lr,  #8            // flags are consumed by the bgt below
        vadd.i32        q8,  q8,  q10
        vadd.i32        q9,  q9,  q11
        vadd.i32        q12, q12, q14
        vadd.i32        q13, q13, q15
        vld1.32         {q14, q15}, [r0]!       // 8 x int32 from sumsq[4]
        vadd.i16        q0,  q0,  q1
        vadd.i16        q2,  q2,  q3
        vld1.16         {q3},       [r1]!       // 8 x int16 from sum[4]
        vadd.i32        q8,  q8,  q12
        vadd.i32        q9,  q9,  q13
        vadd.i16        q0,  q0,  q2
        vadd.i32        q8,  q8,  q14
        vadd.i32        q9,  q9,  q15
        vadd.i16        q0,  q0,  q3
        vst1.32         {q8,  q9},  [r2]!       // sumsq_out
        vst1.16         {q0},       [r3]!       // sum_out
        bgt             1b
        pop             {r4-r11,pc}
endfunc
// void dav1d_sgr_calc_row_ab1_neon(int32_t *a, int16_t *b,
//                                  const int w, const int strength,
//                                  const int bitdepth_max);
// void dav1d_sgr_calc_row_ab2_neon(int32_t *a, int16_t *b,
//                                  const int w, const int strength,
//                                  const int bitdepth_max);
//
// Entry stub for the n = 9 variant. It saves the registers the shared body
// uses, loads the per-variant constants and branches to sgr_calc_ab_neon,
// which also performs the matching vpop/pop and returns to the caller.
//
// Handed over to sgr_calc_ab_neon:
//       r0-r3 = a, b, w, strength (untouched here)
//       r5    = 455 (used there as one_by_x)
//       r6    = clz(bitdepth_max)
//       q15   = n in every 32-bit lane
function sgr_calc_row_ab1_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q7}
        movw            r5,  #455
        vmov.i32        q15, #9                 // n
        ldr             r4,  [sp, #84]          // bitdepth_max; 20 + 64 bytes were pushed above
        clz             r6,  r4
        b               sgr_calc_ab_neon
endfunc
// Entry stub for the n = 25 variant. It saves the registers the shared body
// uses and loads the per-variant constants:
//       r5    = 164 (used by sgr_calc_ab_neon as one_by_x)
//       r6    = clz(bitdepth_max)
//       q15   = n in every 32-bit lane
// There is deliberately no branch or return at the end: execution falls
// through into sgr_calc_ab_neon, which must stay directly below this
// function.
function sgr_calc_row_ab2_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q7}
        mov             r5,  #164
        vmov.i32        q15, #25                // n
        ldr             r4,  [sp, #84]          // bitdepth_max; 20 + 64 bytes were pushed above
        clz             r6,  r4
        // falls through into sgr_calc_ab_neon
endfunc
// Shared body of sgr_calc_row_ab1_neon / sgr_calc_row_ab2_neon. It is not
// exported and not a normal call target: it is entered by a branch from
// sgr_calc_row_ab1_neon and by fall-through from sgr_calc_row_ab2_neon,
// both of which have already pushed {r4-r7,lr} and {q4-q7}. The epilogue
// here pops exactly those and returns to the stub's caller.
//
// In:   r0  = a (int32_t *), accessed with a :128 alignment hint
//       r1  = b (int16_t *), accessed with a :128 alignment hint
//       r2  = w
//       r3  = strength (s)
//       r5  = one_by_x
//       r6  = clz(bitdepth_max)
//       q15 = n in every 32-bit lane
//
// Both rows are updated in place, 8 elements per iteration, while the
// remaining count (starting at w + 2) is still positive. Per element:
//       p = imax(a' * n - b' * b', 0)   with a', b' = a, b rounding-shifted
//                                       right by 2 * / 1 * bitdepth_min_8
//       z = imin(rounded (p * s) >> 20, 255)
//       x = sgr_x_by_x[z]
//       a = (x * b * one_by_x + (1 << 11)) >> 12
//       b = x
//
// Table lookup: only the first 48 bytes of sgr_x_by_x are held in registers
// (q8-q10), pre-biased by -5. vtbl/vtbx leave 0 in the lane for any z >= 48.
// The value for those lanes is rebuilt as 5 minus the number of thresholds
// (55, 72, 101, 169, 254) that z exceeds; adding the 5 back also undoes the
// bias for z < 48, where none of the thresholds is exceeded.
function sgr_calc_ab_neon
        movrel          r12, X(sgr_x_by_x)
        sub             r6,  r6,  #24           // -bitdepth_min_8
        vld1.8          {q8, q9}, [r12, :128]!  // table entries 0..31
        add             r7,  r6,  r6            // -2*bitdepth_min_8
        vmov.i8         q11, #5
        vmov.i8         d10, #55                // idx of last 5
        vld1.8          {q10},    [r12, :128]   // table entries 32..47
        vmov.i8         d11, #72                // idx of last 4
        vmov.i8         d12, #101               // idx of last 3
        vmov.i8         d13, #169               // idx of last 2
        vmov.i8         d14, #254               // idx of last 1
        vmov.i8         d15, #32                // elements consumed in first vtbl
        add             r2,  r2,  #2            // w += 2
        vdup.32         q12, r3                 // s
        vsub.i8         q8,  q8,  q11           // bias the in-register table by -5
        vsub.i8         q9,  q9,  q11
        vsub.i8         q10, q10, q11
        vdup.32         q13, r7                 // -2*bitdepth_min_8

1:
        vld1.32         {q0, q1}, [r0, :128]    // a
        vld1.16         {q2},     [r1, :128]    // b
        vdup.16         q14, r6                 // -bitdepth_min_8 (q14 is reused for one_by_x below)
        subs            r2,  r2,  #8            // flags are consumed by the bgt at the bottom
        vrshl.s32       q0,  q0,  q13           // negative shift count = rounding right shift
        vrshl.s32       q1,  q1,  q13
        vrshl.s16       q4,  q2,  q14           // scaled b; the unscaled b stays in q2
        vmul.i32        q0,  q0,  q15           // a * n
        vmul.i32        q1,  q1,  q15           // a * n
        vmull.u16       q3,  d8,  d8            // b * b
        vmull.u16       q4,  d9,  d9            // b * b
        vqsub.u32       q0,  q0,  q3            // imax(a * n - b * b, 0)
        vqsub.u32       q1,  q1,  q4            // imax(a * n - b * b, 0)
        vmul.i32        q0,  q0,  q12           // p * s
        vmul.i32        q1,  q1,  q12           // p * s
        vqshrn.u32      d0,  q0,  #16
        vqshrn.u32      d1,  q1,  #16
        vqrshrn.u16     d0,  q0,  #4            // imin(z, 255)

        vcgt.u8         d2,  d0,  d10           // = -1 if sgr_x_by_x[d0] < 5
        vcgt.u8         d3,  d0,  d11           // = -1 if sgr_x_by_x[d0] < 4
        vtbl.8          d1,  {q8, q9}, d0       // z in 0..31; 0 for any other z
        vcgt.u8         d6,  d0,  d12           // = -1 if sgr_x_by_x[d0] < 3
        vsub.i8         d9,  d0,  d15           // indices for vtbx
        vcgt.u8         d7,  d0,  d13           // = -1 if sgr_x_by_x[d0] < 2
        vadd.i8         d2,  d2,  d3
        vtbx.8          d1,  {q10}, d9          // z in 32..47; other lanes are left as they are
        vcgt.u8         d8,  d0,  d14           // = -1 if sgr_x_by_x[d0] < 1
        vadd.i8         d6,  d6,  d7
        vadd.i8         d8,  d8,  d22           // d22 = low half of q11 = 5 in every byte
        vadd.i8         d2,  d2,  d6
        vadd.i8         d1,  d1,  d8
        vadd.i8         d1,  d1,  d2
        vmovl.u8        q0,  d1                 // x

        vdup.32         q14, r5                 // one_by_x

        vmull.u16       q1,  d0,  d4            // x * BB[i]
        vmull.u16       q2,  d1,  d5            // x * BB[i]
        vmul.i32        q1,  q1,  q14           // x * BB[i] * sgr_one_by_x
        vmul.i32        q2,  q2,  q14           // x * BB[i] * sgr_one_by_x
        vrshr.s32       q1,  q1,  #12           // AA[i]
        vrshr.s32       q2,  q2,  #12           // AA[i]

        vst1.32         {q1, q2}, [r0, :128]!   // a is overwritten in place; pointer advances here
        vst1.16         {q0},     [r1, :128]!   // b is overwritten with x; pointer advances here
        bgt             1b

        vpop            {q4-q7}                 // undo the pushes done by the entry stub
        pop             {r4-r7,pc}
endfunc
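/*
 * NOTE(review): the lines below appear to be a page footer picked up with
 * this listing rather than part of the assembly source; kept as a comment
 * so they cannot be parsed as code.
 *
 * Messung V0.5 in Prozent C=95 H=100 G=97
 * Dauer der Verarbeitung: 0.5 Sekunden
 * © Formatika GbR, Deutschland
 */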