/*
* Copyright (c) 2020, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#define MAX_UPSAMPLE_SZ 16
// TODO(aomedia:349436249): enable for armv7 after SIGBUS is fixed.
#if AOM_ARCH_AARCH64
// Filter-intra tap table used by av1_filter_intra_predictor_neon().
//
// Layout is [mode][tap][lane]; five mode entries are initialised below.
// - tap: one of the seven neighbouring pixels of a 4x2 output patch:
//     tap 0     -> the pixel above-left of the patch,
//     taps 1..4 -> the four pixels directly above the patch (left to right),
//     taps 5..6 -> the two pixels directly left of the patch (top to bottom).
// - lane: one of the eight output pixels of the patch; lanes 0..3 are the top
//   row and lanes 4..7 are the bottom row.
//
// The predictor multiplies each broadcast neighbour by its 8-lane tap row,
// accumulates, and right-shifts the total by 4 with rounding. The tap 0 row is
// stored as magnitudes and is subtracted rather than added.
//
// These kernels are a transposed version of those defined in reconintra.c,
// with the absolute value of the negatives taken in the top row.
DECLARE_ALIGNED(16 , const uint8_t,
av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7 ][8 ]) = {
// clang-format off
// Mode index 0.
{
{ 6 , 5 , 3 , 3 , 4 , 3 , 3 , 3 },
{ 10 , 2 , 1 , 1 , 6 , 2 , 2 , 1 },
{ 0 , 10 , 1 , 1 , 0 , 6 , 2 , 2 },
{ 0 , 0 , 10 , 2 , 0 , 0 , 6 , 2 },
{ 0 , 0 , 0 , 10 , 0 , 0 , 0 , 6 },
{ 12 , 9 , 7 , 5 , 2 , 2 , 2 , 3 },
{ 0 , 0 , 0 , 0 , 12 , 9 , 7 , 5 }
},
// Mode index 1.
{
{ 10 , 6 , 4 , 2 , 10 , 6 , 4 , 2 },
{ 16 , 0 , 0 , 0 , 16 , 0 , 0 , 0 },
{ 0 , 16 , 0 , 0 , 0 , 16 , 0 , 0 },
{ 0 , 0 , 16 , 0 , 0 , 0 , 16 , 0 },
{ 0 , 0 , 0 , 16 , 0 , 0 , 0 , 16 },
{ 10 , 6 , 4 , 2 , 0 , 0 , 0 , 0 },
{ 0 , 0 , 0 , 0 , 10 , 6 , 4 , 2 }
},
// Mode index 2.
{
{ 8 , 8 , 8 , 8 , 4 , 4 , 4 , 4 },
{ 8 , 0 , 0 , 0 , 4 , 0 , 0 , 0 },
{ 0 , 8 , 0 , 0 , 0 , 4 , 0 , 0 },
{ 0 , 0 , 8 , 0 , 0 , 0 , 4 , 0 },
{ 0 , 0 , 0 , 8 , 0 , 0 , 0 , 4 },
{ 16 , 16 , 16 , 16 , 0 , 0 , 0 , 0 },
{ 0 , 0 , 0 , 0 , 16 , 16 , 16 , 16 }
},
// Mode index 3.
{
{ 2 , 1 , 1 , 0 , 1 , 1 , 1 , 1 },
{ 8 , 3 , 2 , 1 , 4 , 3 , 2 , 2 },
{ 0 , 8 , 3 , 2 , 0 , 4 , 3 , 2 },
{ 0 , 0 , 8 , 3 , 0 , 0 , 4 , 3 },
{ 0 , 0 , 0 , 8 , 0 , 0 , 0 , 4 },
{ 10 , 6 , 4 , 2 , 3 , 4 , 4 , 3 },
{ 0 , 0 , 0 , 0 , 10 , 6 , 4 , 3 }
},
// Mode index 4.
{
{ 12 , 10 , 9 , 8 , 10 , 9 , 8 , 7 },
{ 14 , 0 , 0 , 0 , 12 , 1 , 0 , 0 },
{ 0 , 14 , 0 , 0 , 0 , 12 , 0 , 0 },
{ 0 , 0 , 14 , 0 , 0 , 0 , 12 , 1 },
{ 0 , 0 , 0 , 14 , 0 , 0 , 0 , 12 },
{ 14 , 12 , 11 , 10 , 0 , 0 , 1 , 1 },
{ 0 , 0 , 0 , 0 , 14 , 12 , 11 , 9 }
}
// clang-format on
};
// Number of bits the accumulated tap sum is shifted right by (with rounding
// and saturation to 8 bits) to produce a predicted pixel.
#define FILTER_INTRA_SCALE_BITS 4

// Predicts one 4x2 patch (lanes 0..3: top row, lanes 4..7: bottom row).
//
// taps[i] is the i-th 8-lane row of the selected filter; px[i] is the matching
// neighbouring pixel broadcast to all eight lanes:
//   px[0]    above-left pixel,
//   px[1..4] the four pixels above the patch,
//   px[5..6] the two pixels left of the patch.
// taps[0] holds magnitudes of negative weights, so its product is subtracted.
// The 16-bit accumulator may wrap below zero; it is reinterpreted as signed
// before the rounding, saturating, narrowing shift.
static inline uint8x8_t filter_intra_4x2_neon(const uint8x8_t taps[7],
                                              const uint8x8_t px[7]) {
  uint16x8_t acc = vmull_u8(px[1], taps[1]);
  acc = vmlsl_u8(acc, px[0], taps[0]);
  acc = vmlal_u8(acc, px[2], taps[2]);
  acc = vmlal_u8(acc, px[3], taps[3]);
  acc = vmlal_u8(acc, px[4], taps[4]);
  acc = vmlal_u8(acc, px[5], taps[5]);
  acc = vmlal_u8(acc, px[6], taps[6]);
  return vqrshrun_n_s16(vreinterpretq_s16_u16(acc), FILTER_INTRA_SCALE_BITS);
}

// Filter-intra prediction of a width x height block (both at most 32).
//
// dst/stride: destination block and its row pitch in bytes.
// tx_size:    selects width and height through tx_size_wide/tx_size_high.
// above:      row above the block; above[-1] .. above[width - 1] are read.
// left:       column left of the block; left[0] .. left[height - 1] are read.
// mode:       first index into av1_filter_intra_taps_neon.
//
// The block is produced in 4x2 patches in raster order. Each patch is
// predicted from the row above it and the column to its left, which may
// themselves be earlier predictions, so results are mirrored into a local
// (height + 1) x (width + 1) scratch area that also holds the borders.
void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
                                     TX_SIZE tx_size, const uint8_t *above,
                                     const uint8_t *left, int mode) {
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  assert(width <= 32 && height <= 32);

  uint8x8_t taps[7];
  for (int t = 0; t < 7; ++t) {
    taps[t] = vld1_u8(av1_filter_intra_taps_neon[mode][t]);
  }

  // Row 0 / column 0 hold the borders; the block itself lives at [1..][1..].
  uint8_t buffer[33][33];

  // Row 0: above-left corner followed by the row above the block.
  memcpy(buffer[0], above - 1, (width + 1) * sizeof(buffer[0][0]));

  // Column 0 of rows 1..height: the column left of the block.
  int i = 0;
  do {
    buffer[i + 1][0] = left[i];
    ++i;
  } while (i < height);

  if (width <= 8) {
    // One 4x2 patch per step: faster than 8 columns per step for 8x<h> blocks.
    int row = 1;
    do {
      uint8_t *const out_row = dst + (row - 1) * stride;
      int col = 1;
      uint8x8_t px[7];
      px[0] = vld1_dup_u8(&buffer[row - 1][col - 1]);
      px[5] = vld1_dup_u8(&buffer[row][col - 1]);
      px[6] = vld1_dup_u8(&buffer[row + 1][col - 1]);

      do {
        const uint8x8_t above4 = load_u8_4x1(&buffer[row - 1][col]);
        px[1] = vdup_lane_u8(above4, 0);
        px[2] = vdup_lane_u8(above4, 1);
        px[3] = vdup_lane_u8(above4, 2);
        px[4] = vdup_lane_u8(above4, 3);

        const uint8x8_t patch = filter_intra_4x2_neon(taps, px);

        // Two rows of four pixels each, to the scratch area and to dst.
        store_u8x4_strided_x2(&buffer[row][col], 33, patch);
        store_u8x4_strided_x2(out_row + col - 1, stride, patch);

        // The next patch's above-left is this patch's last above pixel, and
        // its left neighbours are the rightmost pixels just predicted.
        px[0] = px[4];
        px[5] = vdup_lane_u8(patch, 3);
        px[6] = vdup_lane_u8(patch, 7);

        col += 4;
      } while (col < width + 1);

      row += 2;
    } while (row < height + 1);
  } else {
    // Two horizontally adjacent 4x2 patches (8 columns) per step.
    int row = 1;
    do {
      uint8_t *const out_row = dst + (row - 1) * stride;
      int col = 1;
      uint8x8_t px[7];
      px[0] = vld1_dup_u8(&buffer[row - 1][col - 1]);
      px[5] = vld1_dup_u8(&buffer[row][col - 1]);
      px[6] = vld1_dup_u8(&buffer[row + 1][col - 1]);

      do {
        const uint8x8_t above8 = vld1_u8(&buffer[row - 1][col]);

        // Left patch.
        px[1] = vdup_lane_u8(above8, 0);
        px[2] = vdup_lane_u8(above8, 1);
        px[3] = vdup_lane_u8(above8, 2);
        px[4] = vdup_lane_u8(above8, 3);
        const uint8x8_t patch_lo = filter_intra_4x2_neon(taps, px);

        // Right patch: depends on the rightmost column of the left patch.
        px[0] = px[4];
        px[1] = vdup_lane_u8(above8, 4);
        px[2] = vdup_lane_u8(above8, 5);
        px[3] = vdup_lane_u8(above8, 6);
        px[4] = vdup_lane_u8(above8, 7);
        px[5] = vdup_lane_u8(patch_lo, 3);
        px[6] = vdup_lane_u8(patch_lo, 7);
        const uint8x8_t patch_hi = filter_intra_4x2_neon(taps, px);

        // Regroup {top_lo, bot_lo} / {top_hi, bot_hi} into full 8-pixel rows.
        const uint32x2x2_t rows = vzip_u32(vreinterpret_u32_u8(patch_lo),
                                           vreinterpret_u32_u8(patch_hi));
        const uint8x8_t top = vreinterpret_u8_u32(rows.val[0]);
        const uint8x8_t bottom = vreinterpret_u8_u32(rows.val[1]);

        vst1_u8(&buffer[row][col], top);
        vst1_u8(&buffer[row + 1][col], bottom);
        vst1_u8(out_row + col - 1, top);
        vst1_u8(out_row + stride + col - 1, bottom);

        // Carry neighbours over to the next pair of patches.
        px[0] = px[4];
        px[5] = vdup_lane_u8(patch_hi, 3);
        px[6] = vdup_lane_u8(patch_hi, 7);

        col += 8;
      } while (col < width + 1);

      row += 2;
    } while (row < height + 1);
  }
}
#endif // AOM_ARCH_AARCH64
// Eight outputs of the {4, 8, 4} / 16 filter; output i is centred on
// src[i + 1]. Reads src[0..9].
static inline uint8x8_t intra_edge_kernel_4_8_4(const uint8_t *src) {
  const uint8x8_t s0 = vld1_u8(src);
  const uint8x8_t s1 = vld1_u8(src + 1);
  const uint8x8_t s2 = vld1_u8(src + 2);
  // Make use of the identity:
  // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
  const uint16x8_t outer = vaddl_u8(s0, s2);
  const uint16x8_t centre = vaddl_u8(s1, s1);
  return vrshrn_n_u16(vaddq_u16(outer, centre), 2);
}

// Eight outputs of the {5, 6, 5} / 16 filter; output i is centred on
// src[i + 1]. Reads src[0..9].
static inline uint8x8_t intra_edge_kernel_5_6_5(const uint8_t *src) {
  const uint8x8_t s0 = vld1_u8(src);
  const uint8x8_t s1 = vld1_u8(src + 1);
  const uint8x8_t s2 = vld1_u8(src + 2);
  uint16x8_t accum = vmull_u8(s0, vdup_n_u8(5));
  accum = vmlal_u8(accum, s1, vdup_n_u8(6));
  accum = vmlal_u8(accum, s2, vdup_n_u8(5));
  return vrshrn_n_u16(accum, 4);
}

// Eight outputs of the {2, 4, 4, 4, 2} / 16 filter; output i is centred on
// src[i + 2]. Reads src[0..11].
static inline uint8x8_t intra_edge_kernel_2_4_4_4_2(const uint8_t *src) {
  const uint8x8_t s0 = vld1_u8(src);
  const uint8x8_t s1 = vld1_u8(src + 1);
  const uint8x8_t s2 = vld1_u8(src + 2);
  const uint8x8_t s3 = vld1_u8(src + 3);
  const uint8x8_t s4 = vld1_u8(src + 4);
  // Make use of the identity:
  // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
  const uint16x8_t outer = vaddl_u8(s0, s4);
  uint16x8_t inner = vaddl_u8(s1, s2);
  inner = vaddw_u8(inner, s3);
  inner = vaddq_u16(inner, inner);
  return vrshrn_n_u16(vaddq_u16(outer, inner), 3);
}

// Replaces dst[0..count-1] with the first `count` lanes of res (count < 8).
// The remaining lanes are rewritten with the values already in memory, so the
// full 8-byte window dst[0..7] is both loaded and stored.
static inline void intra_edge_store_partial(uint8_t *dst, uint8x8_t res,
                                            int count) {
  // Lane i holds the value i; lanes with i < count take the new result.
  const uint8x8_t lane_index = vcreate_u8(0x0706050403020100);
  const uint8x8_t in_range = vcgt_u8(vdup_n_u8((uint8_t)count), lane_index);
  const uint8x8_t current_dst = vld1_u8(dst);
  vst1_u8(dst, vbsl_u8(in_range, res, current_dst));
}

// Smooths the edge p[0..sz-1] in place with a low-pass filter.
//
// p:        edge pixels; p[0] is never modified, p[1..sz-1] are replaced.
// sz:       number of edge pixels, 0..129 (asserted).
// strength: 0 -> no filtering; 1 -> {4, 8, 4}; 2 -> {5, 6, 5};
//           any other value -> {2, 4, 4, 4, 2}. All kernels are normalised by
//           16 with rounding.
//
// Taps that fall outside the edge use the nearest end pixel. When the number
// of filtered pixels is not a multiple of 8, the final store still touches a
// full 8-byte window (see intra_edge_store_partial), so that window must be
// addressable even though the bytes past the edge keep their values.
void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
  if (!strength) return;
  assert(sz >= 0 && sz <= 129);

  // Only p[1..sz-1] are ever written, so there is nothing to do for sz < 2.
  // Returning here also avoids reading uninitialised scratch when sz == 0.
  if (sz < 2) return;

  uint8_t edge[160];  // Max value of sz + enough padding for vector accesses.
  memcpy(edge + 1, p, sz * sizeof(*p));

  // Replicate the end pixels: one copy in front, two copies behind.
  edge[0] = edge[1];
  edge[sz + 1] = edge[sz];
  edge[sz + 2] = edge[sz];

  // Don't overwrite first pixel.
  uint8_t *dst = p + 1;
  sz--;

  if (strength == 1) {  // Filter: {4, 8, 4}.
    const uint8_t *src = edge + 1;
    while (sz >= 8) {
      vst1_u8(dst, intra_edge_kernel_4_8_4(src));
      src += 8;
      dst += 8;
      sz -= 8;
    }
    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
      intra_edge_store_partial(dst, intra_edge_kernel_4_8_4(src), sz);
    }
  } else if (strength == 2) {  // Filter: {5, 6, 5}.
    const uint8_t *src = edge + 1;
    while (sz >= 8) {
      vst1_u8(dst, intra_edge_kernel_5_6_5(src));
      src += 8;
      dst += 8;
      sz -= 8;
    }
    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
      intra_edge_store_partial(dst, intra_edge_kernel_5_6_5(src), sz);
    }
  } else {  // Filter {2, 4, 4, 4, 2}.
    // The 5-tap window starts one pixel earlier, hence edge and not edge + 1.
    const uint8_t *src = edge;
    while (sz >= 8) {
      vst1_u8(dst, intra_edge_kernel_2_4_4_4_2(src));
      src += 8;
      dst += 8;
      sz -= 8;
    }
    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
      intra_edge_store_partial(dst, intra_edge_kernel_2_4_4_4_2(src), sz);
    }
  }
}
// Doubles the resolution of the edge p[-1..sz-1] in place.
//
// p:  edge pixels; p[-1] and p[0..sz-1] are read, and p[-2] is also written.
// sz: number of pixels in p[0..sz-1]; 0 is a no-op, at most MAX_UPSAMPLE_SZ
//     (asserted).
//
// Output starts at p[-1] and alternates an interpolated pixel with an original
// one: p[2*i - 1] = round((-a + 9*b + 9*c - d) / 16) saturated to 8 bits,
// p[2*i] = original p[i], where a..d are the four source pixels ending at
// original p[i + 1] (ends replicated). Each loop step stores 16 bytes, so the
// caller's buffer must have room for that even when sz is not a multiple of 8.
void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
  if (sz == 0) return;
  assert(sz <= MAX_UPSAMPLE_SZ);

  // Local copy of p[-1..sz-1] with the first pixel duplicated in front and
  // the last pixel duplicated behind, so every 4-tap window is in range.
  uint8_t padded[MAX_UPSAMPLE_SZ + 3];
  const uint8_t first = p[-1];
  padded[0] = first;
  padded[1] = first;
  memcpy(&padded[2], p, sz);
  padded[sz + 2] = p[sz - 1];

  p[-2] = first;

  // `done` counts source pixels consumed; each produces two output bytes.
  int done = 0;
  do {
    const uint8_t *in = padded + done;
    const uint8x8_t a = vld1_u8(in);
    const uint8x8_t b = vld1_u8(in + 1);
    const uint8x8_t c = vld1_u8(in + 2);
    const uint8x8_t d = vld1_u8(in + 3);

    // 9 * (b + c) - (a + d), in signed 16-bit since it can go negative.
    const int16x8_t outer = vreinterpretq_s16_u16(vaddl_u8(a, d));
    const int16x8_t inner = vreinterpretq_s16_u16(vaddl_u8(b, c));
    const int16x8_t weighted = vsubq_s16(vmulq_n_s16(inner, 9), outer);

    // Interleave: interpolated pixel first, then the original pixel (c).
    uint8x8x2_t pair;
    pair.val[0] = vqrshrun_n_s16(weighted, 4);
    pair.val[1] = c;
    vst2_u8(p - 1 + 2 * done, pair);

    done += 8;
  } while (done < sz);
}
/*
 * Measurement V0.5 in percent: C=77 H=93 G=85
 * Processing time: 0.14 seconds (preprocessed on 2026-06-06)
 * © Formatika GbR, Germany
 */