/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
staticinlinevoid obmc_variance_8x1_s16_neon(int16x8_t pre_s16, const int32_t *wsrc, const int32_t *mask,
int32x4_t *ssev,
int32x4_t *sumv) { // For 4xh and 8xh we observe it is faster to avoid the double-widening of // pre. Instead we do a single widening step and narrow the mask to 16-bits // to allow us to perform a widening multiply. Widening multiply // instructions have better throughput on some micro-architectures but for // the larger block sizes this benefit is outweighed by the additional // instruction needed to first narrow the mask vectors.
// ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away // from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. // This difference only affects the bit patterns at the rounding breakpoints // exactly, so we can add -1 to all negative numbers to move the breakpoint // one value across and into the correct rounding region.
diff_s32_lo = vsraq_n_s32(diff_s32_lo, diff_s32_lo, 31);
diff_s32_hi = vsraq_n_s32(diff_s32_hi, diff_s32_hi, 31);
int32x4_t round_s32_lo = vrshrq_n_s32(diff_s32_lo, 12);
int32x4_t round_s32_hi = vrshrq_n_s32(diff_s32_hi, 12);
// Use tbl for doing a double-width zero extension from 8->32 bits since we can // do this in one instruction rather than two (indices out of range (255 here) // are set to zero by tbl).
DECLARE_ALIGNED(16, staticconst uint8_t, obmc_variance_permute_idx[]) = {
0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255,
4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255,
8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
};
// ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away from // zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. This // difference only affects the bit patterns at the rounding breakpoints // exactly, so we can add -1 to all negative numbers to move the breakpoint // one value across and into the correct rounding region.
diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
staticinlinevoid obmc_variance_large_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height, unsigned *sse, int *sum) {
assert(width % 16 == 0);
// Use tbl for doing a double-width zero extension from 8->32 bits since we // can do this in one instruction rather than two.
uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);
staticinlinevoid obmc_variance_large_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height, unsigned *sse, int *sum) { // Non-aarch64 targets do not have a 128-bit tbl instruction, so use the // widening version of the core kernel instead.
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
0.13
Bemerkung:
(vorverarbeitet am 2026-04-27)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.