Quelle resize_neon_dotprod.c

Sprache: C

/*
* Copyright (c) 2024, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/arm/resize_neon.h"
#include "av1/common/resize.h"
#include "config/aom_scale_rtcd.h"
#include "config/av1_rtcd.h"

// clang-format off
// Byte indices used as table-lookup operands by the 2:1 horizontal filter.
// Each 16-entry half selects four overlapping 4-byte windows: the first half
// starts its windows at bytes 0, 2, 4 and 6, the second half at bytes 4, 6, 8
// and 10. scale_2_to_1_horiz_8tap() loads both halves as a vector pair.
DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = {
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5,  6,  7,  6,  7,  8,  9,
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
};
// Byte indices used as a table-lookup operand by the 4:1 horizontal filter.
// The 16 entries select two 8-byte windows, starting at bytes 0 and 4.
// scale_4_to_1_horiz_8tap() loads the table as a single vector.
DECLARE_ALIGNED(16, static const uint8_t, kScale4DotProdPermuteTbl[16]) = {
  0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11
};
// clang-format on

// Applies the 8-tap filter to one row and returns 8 filtered pixels.
//
// s0 supplies the samples for the first four outputs and s1 the samples for
// the last four. permute_tbl.val[0] gathers the samples multiplied by taps
// 0-3 and permute_tbl.val[1] those multiplied by taps 4-7. The taps in
// `filter` are expected to be the halved filter values.
static inline uint8x8_t scale_2_to_1_filter8_8(const uint8x16_t s0,
                                               const uint8x16_t s1,
                                               const uint8x16x2_t permute_tbl,
                                               const int8x8_t filter) {
  // The dot product is signed 8-bit, so move the samples from [0, 255] to
  // [-128, 127] by subtracting 128.
  const uint8x16_t bias = vdupq_n_u8(128);
  const int8x16_t lo = vreinterpretq_s8_u8(vsubq_u8(s0, bias));
  const int8x16_t hi = vreinterpretq_s8_u8(vsubq_u8(s1, bias));

  // Arrange the samples so that every 32-bit lane holds the four bytes that
  // meet one half of the filter.
  const int8x16_t lo_taps0123 = vqtbl1q_s8(lo, permute_tbl.val[0]);
  const int8x16_t lo_taps4567 = vqtbl1q_s8(lo, permute_tbl.val[1]);
  const int8x16_t hi_taps0123 = vqtbl1q_s8(hi, permute_tbl.val[0]);
  const int8x16_t hi_taps4567 = vqtbl1q_s8(hi, permute_tbl.val[1]);

  // Starting value of each accumulator. Subtracting 128 from every sample
  // has to be compensated by 128 << FILTER_BITS; that value is shifted right
  // by one because the filter values were halved.
  const int32x4_t offset = vdupq_n_s32((128 << FILTER_BITS) >> 1);

  // Outputs 0-3: taps 0-3 (filter lane 0), then taps 4-7 (filter lane 1).
  int32x4_t out0123 = vdotq_lane_s32(offset, lo_taps0123, filter, 0);
  out0123 = vdotq_lane_s32(out0123, lo_taps4567, filter, 1);

  // Outputs 4-7, same scheme on the second source vector.
  int32x4_t out4567 = vdotq_lane_s32(offset, hi_taps0123, filter, 0);
  out4567 = vdotq_lane_s32(out4567, hi_taps4567, filter, 1);

  const int16x8_t out = vcombine_s16(vmovn_s32(out0123), vmovn_s32(out4567));

  // One bit less than FILTER_BITS because the filter values were halved.
  return vqrshrun_n_s16(out, FILTER_BITS - 1);
}

// Horizontal pass of the 2:1 downscaler.
//
// Works on tiles of 8 rows: every inner iteration consumes 16 source columns
// and produces 8 destination columns for each of the 8 rows.
//
//   src, src_stride  first source sample and distance between source rows
//   w, h             number of output columns and rows requested
//   dst, dst_stride  destination and distance between destination rows
//   filters          8 filter taps as 16-bit values; they are narrowed to
//                    8 bits here, so they must fit in int8_t
//
// Both loops are do-while loops stepping by 8, so at least one tile is
// processed and w and h are effectively rounded up to a multiple of 8.
// NOTE(review): this presumably requires the destination to be padded
// accordingly - confirm against the callers' buffer sizing.
static inline void scale_2_to_1_horiz_8tap(const uint8_t *src,
                                           const int src_stride, int w, int h,
                                           uint8_t *dst, const int dst_stride,
                                           const int16x8_t filters) {
  const int8x8_t filter = vmovn_s16(filters);
  const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl);

  do {
    const uint8_t *s = src;
    uint8_t *d = dst;
    int width = w;
    do {
      // For every row, element [0] is loaded starting at s and element [1]
      // starting at s + 8; the two loads feed output pixels 0-3 and 4-7.
      uint8x16_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
      load_u8_16x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
                   &s5[0], &s6[0], &s7[0]);
      load_u8_16x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
                   &s5[1], &s6[1], &s7[1]);

      uint8x8_t d0 = scale_2_to_1_filter8_8(s0[0], s0[1], permute_tbl, filter);
      uint8x8_t d1 = scale_2_to_1_filter8_8(s1[0], s1[1], permute_tbl, filter);
      uint8x8_t d2 = scale_2_to_1_filter8_8(s2[0], s2[1], permute_tbl, filter);
      uint8x8_t d3 = scale_2_to_1_filter8_8(s3[0], s3[1], permute_tbl, filter);

      uint8x8_t d4 = scale_2_to_1_filter8_8(s4[0], s4[1], permute_tbl, filter);
      uint8x8_t d5 = scale_2_to_1_filter8_8(s5[0], s5[1], permute_tbl, filter);
      uint8x8_t d6 = scale_2_to_1_filter8_8(s6[0], s6[1], permute_tbl, filter);
      uint8x8_t d7 = scale_2_to_1_filter8_8(s7[0], s7[1], permute_tbl, filter);

      store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

      // 8 output pixels consume 16 input pixels (2:1).
      d += 8;
      s += 16;
      width -= 8;
    } while (width > 0);

    dst += 8 * dst_stride;
    src += 8 * src_stride;
    h -= 8;
  } while (h > 0);
}

// Downscales one plane by 2:1 in two passes: a horizontal 8-tap pass from
// `src` into the scratch buffer `im_block`, then a vertical 6-tap pass from
// `im_block` into `dst`.
//
// w and h are the output dimensions and must both be positive. filter_ptr
// points to the 8 filter taps. im_block is written with a row stride of w
// rounded up to a multiple of 8.
static inline void scale_plane_2_to_1_8tap(const uint8_t *src,
                                           const int src_stride, uint8_t *dst,
                                           const int dst_stride, const int w,
                                           const int h,
                                           const int16_t *const filter_ptr,
                                           uint8_t *const im_block) {
  assert(w > 0 && h > 0);

  // All filter values are even, so halve them: they then fit in int8_t for
  // the horizontal pass and keep the vertical pass within 16-bit elements.
  const int16x8_t half_taps = vshrq_n_s16(vld1q_s16(filter_ptr), 1);

  // Geometry of the intermediate block.
  const int mid_stride = (w + 7) & ~7;
  const int mid_rows = 2 * h + SUBPEL_TAPS - 3;

  // The horizontal pass starts (SUBPEL_TAPS / 2 - 1) columns to the left of
  // and the same number of rows above `src`.
  const ptrdiff_t col_back = SUBPEL_TAPS / 2 - 1;
  const ptrdiff_t row_back = (SUBPEL_TAPS / 2 - 1) * src_stride;
  const uint8_t *const first_sample = src - col_back - row_back;

  scale_2_to_1_horiz_8tap(first_sample, src_stride, w, mid_rows, im_block,
                          mid_stride, half_taps);

  // EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR are 0-padded, which lets the
  // vertical pass use a 6-tap kernel; it reads from the second row of the
  // intermediate block onwards.
  uint8_t *const vert_in = im_block + mid_stride;
  scale_2_to_1_vert_6tap(vert_in, mid_stride, w, h, dst, dst_stride,
                         half_taps);
}

// Applies the 8-tap filter to four rows at once and returns 8 pixels.
//
// Each of s0..s3 is one source row. permute_tbl picks the bytes of a row
// that are multiplied with the 8 taps; the same 8 taps are used for both
// halves of the permuted vector. The four rows are reduced pairwise, so the
// result holds the outputs of s0 first, followed by s1, s2 and s3. The taps
// in `filter` are expected to be the halved filter values.
static inline uint8x8_t scale_4_to_1_filter8_8(
    const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2,
    const uint8x16_t s3, const uint8x16_t permute_tbl, const int8x8_t filter) {
  // Duplicate the taps so that one 128-bit dot product covers both halves.
  const int8x16_t taps_x2 = vcombine_s8(filter, filter);

  // The dot product is signed 8-bit, so move the samples from [0, 255] to
  // [-128, 127] by subtracting 128 before permuting them.
  const uint8x16_t bias = vdupq_n_u8(128);
  const int8x16_t row0 =
      vqtbl1q_s8(vreinterpretq_s8_u8(vsubq_u8(s0, bias)), permute_tbl);
  const int8x16_t row1 =
      vqtbl1q_s8(vreinterpretq_s8_u8(vsubq_u8(s1, bias)), permute_tbl);
  const int8x16_t row2 =
      vqtbl1q_s8(vreinterpretq_s8_u8(vsubq_u8(s2, bias)), permute_tbl);
  const int8x16_t row3 =
      vqtbl1q_s8(vreinterpretq_s8_u8(vsubq_u8(s3, bias)), permute_tbl);

  // Starting value of each accumulator. Subtracting 128 from every sample
  // has to be compensated by 128 << FILTER_BITS. That value is shifted right
  // by two: once because the filter values were halved, and once because the
  // pairwise add below sums two lanes that each carry this offset.
  const int32x4_t offset = vdupq_n_s32((128 << FILTER_BITS) >> 2);

  const int32x4_t part0 = vdotq_s32(offset, row0, taps_x2);
  const int32x4_t part1 = vdotq_s32(offset, row1, taps_x2);
  const int32x4_t part2 = vdotq_s32(offset, row2, taps_x2);
  const int32x4_t part3 = vdotq_s32(offset, row3, taps_x2);

  // Adjacent lanes hold the two halves of one 8-tap sum; add them together.
  const int32x4_t rows01 = vpaddq_s32(part0, part1);
  const int32x4_t rows23 = vpaddq_s32(part2, part3);

  const int16x8_t packed = vcombine_s16(vmovn_s32(rows01), vmovn_s32(rows23));

  // One bit less than FILTER_BITS because the filter values were halved.
  return vqrshrun_n_s16(packed, FILTER_BITS - 1);
}

// Horizontal pass of the 4:1 downscaler.
//
// Works on groups of 8 rows: every inner iteration consumes 8 source columns
// and produces 2 destination columns for each of the 8 rows. w and h are the
// requested output columns and rows; both loops are do-while loops, so at
// least one group is always processed. `filters` holds the 8 taps as 16-bit
// values and is narrowed to 8 bits here.
static inline void scale_4_to_1_horiz_8tap(const uint8_t *src,
                                           const int src_stride, int w, int h,
                                           uint8_t *dst, const int dst_stride,
                                           const int16x8_t filters) {
  const uint8x16_t permute_tbl = vld1q_u8(kScale4DotProdPermuteTbl);
  const int8x8_t filter = vmovn_s16(filters);

  do {
    const uint8_t *in = src;
    uint8_t *out = dst;
    int cols_left = w;

    do {
      uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
      load_u8_16x8(in, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);

      // Rows 0-3 and rows 4-7 are filtered by separate calls.
      const uint8x8_t top =
          scale_4_to_1_filter8_8(r0, r1, r2, r3, permute_tbl, filter);
      const uint8x8_t bottom =
          scale_4_to_1_filter8_8(r4, r5, r6, r7, permute_tbl, filter);

      store_u8x2_strided_x4(out, dst_stride, top);
      store_u8x2_strided_x4(out + 4 * dst_stride, dst_stride, bottom);

      // 2 output pixels consume 8 input pixels (4:1).
      in += 8;
      out += 2;
      cols_left -= 2;
    } while (cols_left > 0);

    src += 8 * src_stride;
    dst += 8 * dst_stride;
    h -= 8;
  } while (h > 0);
}

// Downscales one plane by 4:1 in two passes: a horizontal 8-tap pass from
// `src` into the scratch buffer `im_block`, then a vertical 6-tap pass from
// `im_block` into `dst`.
//
// w and h are the output dimensions and must both be positive. filter_ptr
// points to the 8 filter taps. im_block is written with a row stride of w
// rounded up to an even number.
static inline void scale_plane_4_to_1_8tap(const uint8_t *src,
                                           const int src_stride, uint8_t *dst,
                                           const int dst_stride, const int w,
                                           const int h,
                                           const int16_t *const filter_ptr,
                                           uint8_t *const im_block) {
  assert(w > 0 && h > 0);

  // All filter values are even, so halve them: they then fit in int8_t for
  // the horizontal pass and keep the vertical pass within 16-bit elements.
  const int16x8_t half_taps = vshrq_n_s16(vld1q_s16(filter_ptr), 1);

  // Geometry of the intermediate block.
  const int mid_stride = (w + 1) & ~1;
  const int mid_rows = 4 * h + SUBPEL_TAPS - 2;

  // The horizontal pass starts (SUBPEL_TAPS / 2 - 1) columns to the left of
  // and the same number of rows above `src`.
  const ptrdiff_t col_back = SUBPEL_TAPS / 2 - 1;
  const ptrdiff_t row_back = (SUBPEL_TAPS / 2 - 1) * src_stride;
  const uint8_t *const first_sample = src - col_back - row_back;

  scale_4_to_1_horiz_8tap(first_sample, src_stride, w, mid_rows, im_block,
                          mid_stride, half_taps);

  // EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR are 0-padded, which lets the
  // vertical pass use a 6-tap kernel; it reads from the second row of the
  // intermediate block onwards.
  uint8_t *const vert_in = im_block + mid_stride;
  scale_4_to_1_vert_6tap(vert_in, mid_stride, w, h, dst, dst_stride,
                         half_taps);
}

// Returns true when the source is exactly twice, or exactly four times, the
// destination size in both dimensions. A mixed ratio (for example 2x in
// width and 4x in height) does not qualify.
static inline bool has_normative_scaler_neon_dotprod(const int src_width,
                                                     const int src_height,
                                                     const int dst_width,
                                                     const int dst_height) {
  // Exact 2:1 in both directions.
  if (src_width == 2 * dst_width && src_height == 2 * dst_height) {
    return true;
  }
  // Otherwise only exact 4:1 in both directions is supported.
  return src_width == 4 * dst_width && src_height == 4 * dst_height;
}

// Resizes the planes of `src` into `dst` and extends the borders of `dst`.
//
// The dot-product kernels in this file are used only when every processed
// plane is downscaled by exactly 2:1 or exactly 4:1, the filter is not
// BILINEAR and the phase is non-zero. Every other combination is forwarded
// to av1_resize_and_extend_frame_neon().
//
//   src         source frame
//   dst         destination frame; its crop dimensions select the ratio
//   filter      BILINEAR, EIGHTTAP_SMOOTH or EIGHTTAP_REGULAR
//   phase       index of the kernel within the selected filter
//   num_planes  number of planes to process (capped at MAX_MB_PLANE)
//
// A scratch buffer is allocated for each plane. If an allocation fails, the
// remaining planes are skipped and the whole frame is produced by
// av1_resize_and_extend_frame_c() instead.
void av1_resize_and_extend_frame_neon_dotprod(const YV12_BUFFER_CONFIG *src,
                                              YV12_BUFFER_CONFIG *dst,
                                              const InterpFilter filter,
                                              const int phase,
                                              const int num_planes) {
  assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH ||
         filter == EIGHTTAP_REGULAR);

  bool has_normative_scaler =
      has_normative_scaler_neon_dotprod(src->y_crop_width, src->y_crop_height,
                                        dst->y_crop_width, dst->y_crop_height);

  if (num_planes > 1) {
    has_normative_scaler =
        has_normative_scaler && has_normative_scaler_neon_dotprod(
                                    src->uv_crop_width, src->uv_crop_height,
                                    dst->uv_crop_width, dst->uv_crop_height);
  }

  if (!has_normative_scaler || filter == BILINEAR || phase == 0) {
    av1_resize_and_extend_frame_neon(src, dst, filter, phase, num_planes);
    return;
  }

  // Values that do not change from plane to plane: the kernel table of the
  // selected filter and the luma crop dimensions rounded up to even, which
  // size the scratch buffer of every plane.
  const InterpKernel *const interp_kernel =
      (const InterpKernel *)av1_interp_filter_params_list[filter].filter_ptr;
  const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
  const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;

  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
  // the static analysis warnings.
  int malloc_failed = 0;
  for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
    const int is_uv = i > 0;
    const int src_w = src->crop_widths[is_uv];
    const int src_h = src->crop_heights[is_uv];
    const int dst_w = dst->crop_widths[is_uv];
    const int dst_h = dst->crop_heights[is_uv];

    if (2 * dst_w == src_w && 2 * dst_h == src_h) {
      // Stride and height are rounded up to multiples of 8 for the scratch
      // buffer.
      const int buffer_stride = (dst_y_w + 7) & ~7;
      const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
      // Multiply in size_t: the product of two ints can exceed INT_MAX for
      // very large frames, and signed overflow is undefined behaviour.
      uint8_t *const temp_buffer =
          (uint8_t *)malloc((size_t)buffer_stride * (size_t)buffer_height);
      if (!temp_buffer) {
        malloc_failed = 1;
        break;
      }
      scale_plane_2_to_1_8tap(src->buffers[i], src->strides[is_uv],
                              dst->buffers[i], dst->strides[is_uv], dst_w,
                              dst_h, interp_kernel[phase], temp_buffer);
      free(temp_buffer);
    } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
      // Stride is rounded up to even, height to a multiple of 8.
      const int buffer_stride = (dst_y_w + 1) & ~1;
      const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
      // Multiply in size_t for the same reason as in the 2:1 branch.
      uint8_t *const temp_buffer =
          (uint8_t *)malloc((size_t)buffer_stride * (size_t)buffer_height);
      if (!temp_buffer) {
        malloc_failed = 1;
        break;
      }
      scale_plane_4_to_1_8tap(src->buffers[i], src->strides[is_uv],
                              dst->buffers[i], dst->strides[is_uv], dst_w,
                              dst_h, interp_kernel[phase], temp_buffer);
      free(temp_buffer);
    }
  }

  if (malloc_failed) {
    // Redo the whole frame with the C implementation, which also overwrites
    // any planes already produced above.
    av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
  } else {
    aom_extend_frame_borders(dst, num_planes);
  }
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.14 Sekunden (vorverarbeitet am 2026-06-06) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.