Quelle encodetxb_neon.c

Sprache: C

/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/

#include <arm_neon.h>
#include <assert.h>
#include <math.h>

#include "config/aom_config.h"

#include "aom_dsp/arm/mem_neon.h"
#include "av1/common/txb_common.h"
#include "av1/encoder/encodetxb.h"

void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
                              const int height, uint8_t *const levels) {
  const int stride = height + TX_PAD_HOR;
  memset(levels - TX_PAD_TOP * stride, 0,
         sizeof(*levels) * TX_PAD_TOP * stride);
  memset(levels + stride * width, 0,
         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));

  const int32x4_t zeros = vdupq_n_s32(0);
  int i = 0;
  uint8_t *ls = levels;
  const tran_low_t *cf = coeff;
  if (height == 4) {
    do {
      const int32x4_t coeffA = vld1q_s32(cf);
      const int32x4_t coeffB = vld1q_s32(cf + height);
      const int16x8_t coeffAB =
          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
      const int16x8_t absAB = vqabsq_s16(coeffAB);
      const int8x8_t absABs = vqmovn_s16(absAB);
#if AOM_ARCH_AARCH64
      const int8x16_t absAB8 =
          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
      const uint8x16_t lsAB =
          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
#else
      const int32x2x2_t absAB8 =
          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
      const uint8x16_t lsAB =
          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
#endif
      vst1q_u8(ls, lsAB);
      ls += (stride << 1);
      cf += (height << 1);
      i += 2;
    } while (i < width);
  } else if (height == 8) {
    do {
      const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
      const int16x8_t absAB = vqabsq_s16(coeffAB);
      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
      vst1q_u8(ls, absAB8);
      ls += stride;
      cf += height;
      i += 1;
    } while (i < width);
  } else {
    do {
      int j = 0;
      do {
        const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
        const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
        const int16x8_t absAB = vqabsq_s16(coeffAB);
        const int16x8_t absCD = vqabsq_s16(coeffCD);
        const uint8x16_t absABCD = vreinterpretq_u8_s8(
            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
        vst1q_u8((ls + j), absABCD);
        j += 16;
        cf += 16;
      } while (j < height);
      *(int32_t *)(ls + height) = 0;
      ls += stride;
      i += 1;
    } while (i < width);
  }
}

// get_4_nz_map_contexts_2d coefficients:
static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
  { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
  { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 }
};

// get_4_nz_map_contexts_hor coefficients:
/* clang-format off */
#define SIG_COEF_CONTEXTS_2D_X4_051010                        \
  (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
  ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
/* clang-format on */

// get_4_nz_map_contexts_ver coefficients:
static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = {
  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
};

// get_8_coeff_contexts_2d coefficients:
// if (width == 8)
static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
  { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
};
// if (width < 8)
static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
  { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 },
  { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 }
};

// if (width > 8)
static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
  { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
};

// get_4_nz_map_contexts_ver coefficients:
static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = {
  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
};

// get_16n_coeff_contexts_2d coefficients:
// real_width == real_height
static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
  { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
};

// real_width < real_height
static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
  { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
};

// real_width > real_height
static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
  { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
};

// get_16n_coeff_contexts_hor coefficients:
static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = {
  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
};

// end of coefficients declaration area

static inline uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
                                                const int byte_stride) {
#if AOM_ARCH_AARCH64
  uint32x4_t v_data = vld1q_u32((uint32_t *)src);
  v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
  v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
  v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);

  return vreinterpretq_u8_u32(v_data);
#else
  return load_unaligned_u8q(src, byte_stride);
#endif
}

static inline uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
                                                const int byte_stride) {
#if AOM_ARCH_AARCH64
  uint64x2_t v_data = vld1q_u64((uint64_t *)src);
  v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);

  return vreinterpretq_u8_u64(v_data);
#else
  uint8x8_t v_data_low = vld1_u8(src);
  uint8x8_t v_data_high = vld1_u8(src + byte_stride);

  return vcombine_u8(v_data_low, v_data_high);
#endif
}

static inline uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
                                                 const int byte_stride) {
  (void)byte_stride;
  return vld1q_u8(src);
}

static inline void load_levels_4x4x5(const uint8_t *const src, const int stride,
                                     const ptrdiff_t *const offsets,
                                     uint8x16_t *const level) {
  level[0] = load_8bit_4x4_to_1_reg(&src[1], stride);
  level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride);
  level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride);
  level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride);
  level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride);
}

static inline void load_levels_8x2x5(const uint8_t *const src, const int stride,
                                     const ptrdiff_t *const offsets,
                                     uint8x16_t *const level) {
  level[0] = load_8bit_8x2_to_1_reg(&src[1], stride);
  level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride);
  level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride);
  level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride);
  level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride);
}

static inline void load_levels_16x1x5(const uint8_t *const src,
                                      const int stride,
                                      const ptrdiff_t *const offsets,
                                      uint8x16_t *const level) {
  level[0] = load_8bit_16x1_to_1_reg(&src[1], stride);
  level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride);
  level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride);
  level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride);
  level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride);
}

static inline uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
  const uint8x16_t const_3 = vdupq_n_u8(3);
  const uint8x16_t const_4 = vdupq_n_u8(4);
  uint8x16_t count;

  count = vminq_u8(level[0], const_3);
  level[1] = vminq_u8(level[1], const_3);
  level[2] = vminq_u8(level[2], const_3);
  level[3] = vminq_u8(level[3], const_3);
  level[4] = vminq_u8(level[4], const_3);
  count = vaddq_u8(count, level[1]);
  count = vaddq_u8(count, level[2]);
  count = vaddq_u8(count, level[3]);
  count = vaddq_u8(count, level[4]);

  count = vrshrq_n_u8(count, 1);
  count = vminq_u8(count, const_4);
  return count;
}

static inline void get_4_nz_map_contexts_2d(const uint8_t *levels,
                                            const int width,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *const coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;
  const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);

  uint8x16_t pos_to_offset =
      (width == 4) ? vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]);

  uint8x16_t count;
  uint8x16_t level[5];
  uint8_t *cc = coeff_contexts;

  assert(!(width % 4));

  int col = width;
  do {
    load_levels_4x4x5(levels, stride, offsets, level);
    count = get_coeff_contexts_kernel(level);
    count = vaddq_u8(count, pos_to_offset);
    vst1q_u8(cc, count);
    pos_to_offset = pos_to_offset_large;
    levels += 4 * stride;
    cc += 16;
    col -= 4;
  } while (col);

  coeff_contexts[0] = 0;
}

static inline void get_4_nz_map_contexts_ver(const uint8_t *levels,
                                             const int width,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;

  const uint8x16_t pos_to_offset =
      vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));

  uint8x16_t count;
  uint8x16_t level[5];

  assert(!(width % 4));

  int col = width;
  do {
    load_levels_4x4x5(levels, stride, offsets, level);
    count = get_coeff_contexts_kernel(level);
    count = vaddq_u8(count, pos_to_offset);
    vst1q_u8(coeff_contexts, count);
    levels += 4 * stride;
    coeff_contexts += 16;
    col -= 4;
  } while (col);
}

static inline void get_4_nz_map_contexts_hor(const uint8_t *levels,
                                             const int width,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;
  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);

  uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor);

  uint8x16_t count;
  uint8x16_t level[5];

  assert(!(width % 4));

  int col = width;
  do {
    load_levels_4x4x5(levels, stride, offsets, level);
    count = get_coeff_contexts_kernel(level);
    count = vaddq_u8(count, pos_to_offset);
    vst1q_u8(coeff_contexts, count);
    pos_to_offset = pos_to_offset_large;
    levels += 4 * stride;
    coeff_contexts += 16;
    col -= 4;
  } while (col);
}

static inline void get_8_coeff_contexts_2d(const uint8_t *levels,
                                           const int width,
                                           const ptrdiff_t *const offsets,
                                           uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;
  uint8_t *cc = coeff_contexts;
  uint8x16_t count;
  uint8x16_t level[5];
  uint8x16_t pos_to_offset[3];

  assert(!(width % 2));

  if (width == 8) {
    pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
    pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
  } else if (width < 8) {
    pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
    pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
  } else {
    pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
    pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
  }
  pos_to_offset[2] = vdupq_n_u8(21);

  int col = width;
  do {
    load_levels_8x2x5(levels, stride, offsets, level);
    count = get_coeff_contexts_kernel(level);
    count = vaddq_u8(count, pos_to_offset[0]);
    vst1q_u8(cc, count);
    pos_to_offset[0] = pos_to_offset[1];
    pos_to_offset[1] = pos_to_offset[2];
    levels += 2 * stride;
    cc += 16;
    col -= 2;
  } while (col);

  coeff_contexts[0] = 0;
}

static inline void get_8_coeff_contexts_ver(const uint8_t *levels,
                                            const int width,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;

  const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver);

  uint8x16_t count;
  uint8x16_t level[5];

  assert(!(width % 2));

  int col = width;
  do {
    load_levels_8x2x5(levels, stride, offsets, level);
    count = get_coeff_contexts_kernel(level);
    count = vaddq_u8(count, pos_to_offset);
    vst1q_u8(coeff_contexts, count);
    levels += 2 * stride;
    coeff_contexts += 16;
    col -= 2;
  } while (col);
}

static inline void get_8_coeff_contexts_hor(const uint8_t *levels,
                                            const int width,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;
  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);

  uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
                                         vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));

  uint8x16_t count;
  uint8x16_t level[5];

  assert(!(width % 2));

  int col = width;
  do {
    load_levels_8x2x5(levels, stride, offsets, level);
    count = get_coeff_contexts_kernel(level);
    count = vaddq_u8(count, pos_to_offset);
    vst1q_u8(coeff_contexts, count);
    pos_to_offset = pos_to_offset_large;
    levels += 2 * stride;
    coeff_contexts += 16;
    col -= 2;
  } while (col);
}

static inline void get_16n_coeff_contexts_2d(const uint8_t *levels,
                                             const int real_width,
                                             const int real_height,
                                             const int width, const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = height + TX_PAD_HOR;
  uint8_t *cc = coeff_contexts;
  int col = width;
  uint8x16_t pos_to_offset[5];
  uint8x16_t pos_to_offset_large[3];
  uint8x16_t count;
  uint8x16_t level[5];

  assert(!(height % 16));

  pos_to_offset_large[2] = vdupq_n_u8(21);
  if (real_width == real_height) {
    pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
    pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
    pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
    pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
    pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
        pos_to_offset_large[2];
  } else if (real_width < real_height) {
    pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
    pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
    pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
        vld1q_u8(c_16_po_2d_g[2]);
    pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
  } else {  // real_width > real_height
    pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
    pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
    pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
    pos_to_offset[4] = pos_to_offset_large[2];
    pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16);
  }

  do {
    int h = height;

    do {
      load_levels_16x1x5(levels, stride, offsets, level);
      count = get_coeff_contexts_kernel(level);
      count = vaddq_u8(count, pos_to_offset[0]);
      vst1q_u8(cc, count);
      levels += 16;
      cc += 16;
      h -= 16;
      pos_to_offset[0] = pos_to_offset_large[0];
    } while (h);

    pos_to_offset[0] = pos_to_offset[1];
    pos_to_offset[1] = pos_to_offset[2];
    pos_to_offset[2] = pos_to_offset[3];
    pos_to_offset[3] = pos_to_offset[4];
    pos_to_offset_large[0] = pos_to_offset_large[1];
    pos_to_offset_large[1] = pos_to_offset_large[2];
    levels += TX_PAD_HOR;
  } while (--col);

  coeff_contexts[0] = 0;
}

static inline void get_16n_coeff_contexts_ver(const uint8_t *levels,
                                              const int width, const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
  const int stride = height + TX_PAD_HOR;

  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);

  uint8x16_t count;
  uint8x16_t level[5];

  assert(!(height % 16));

  int col = width;
  do {
    uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver);

    int h = height;
    do {
      load_levels_16x1x5(levels, stride, offsets, level);
      count = get_coeff_contexts_kernel(level);
      count = vaddq_u8(count, pos_to_offset);
      vst1q_u8(coeff_contexts, count);
      pos_to_offset = pos_to_offset_large;
      levels += 16;
      coeff_contexts += 16;
      h -= 16;
    } while (h);

    levels += TX_PAD_HOR;
  } while (--col);
}

static inline void get_16n_coeff_contexts_hor(const uint8_t *levels,
                                              const int width, const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
  const int stride = height + TX_PAD_HOR;

  uint8x16_t pos_to_offset[3];
  uint8x16_t count;
  uint8x16_t level[5];

  assert(!(height % 16));

  pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
  pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
  pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);

  int col = width;
  do {
    int h = height;
    do {
      load_levels_16x1x5(levels, stride, offsets, level);
      count = get_coeff_contexts_kernel(level);
      count = vaddq_u8(count, pos_to_offset[0]);
      vst1q_u8(coeff_contexts, count);
      levels += 16;
      coeff_contexts += 16;
      h -= 16;
    } while (h);

    pos_to_offset[0] = pos_to_offset[1];
    pos_to_offset[1] = pos_to_offset[2];
    levels += TX_PAD_HOR;
  } while (--col);
}

// Note: levels[] must be in the range [0, 127], inclusive.
void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
                                  const int16_t *const scan, const uint16_t eob,
                                  const TX_SIZE tx_size,
                                  const TX_CLASS tx_class,
                                  int8_t *const coeff_contexts) {
  const int last_idx = eob - 1;
  if (!last_idx) {
    coeff_contexts[0] = 0;
    return;
  }

  uint8_t *const coefficients = (uint8_t *const)coeff_contexts;

  const int real_width = tx_size_wide[tx_size];
  const int real_height = tx_size_high[tx_size];
  const int width = get_txb_wide(tx_size);
  const int height = get_txb_high(tx_size);
  const int stride = height + TX_PAD_HOR;
  ptrdiff_t offsets[3];

  /* coeff_contexts must be 16 byte aligned. */
  assert(!((intptr_t)coeff_contexts & 0xf));

  if (tx_class == TX_CLASS_2D) {
    offsets[0] = 0 * stride + 2;
    offsets[1] = 1 * stride + 1;
    offsets[2] = 2 * stride + 0;

    if (height == 4) {
      get_4_nz_map_contexts_2d(levels, width, offsets, coefficients);
    } else if (height == 8) {
      get_8_coeff_contexts_2d(levels, width, offsets, coefficients);
    } else {
      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
                                offsets, coefficients);
    }
  } else if (tx_class == TX_CLASS_HORIZ) {
    offsets[0] = 2 * stride;
    offsets[1] = 3 * stride;
    offsets[2] = 4 * stride;
    if (height == 4) {
      get_4_nz_map_contexts_hor(levels, width, offsets, coefficients);
    } else if (height == 8) {
      get_8_coeff_contexts_hor(levels, width, offsets, coefficients);
    } else {
      get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
    }
  } else {  // TX_CLASS_VERT
    offsets[0] = 2;
    offsets[1] = 3;
    offsets[2] = 4;
    if (height == 4) {
      get_4_nz_map_contexts_ver(levels, width, offsets, coefficients);
    } else if (height == 8) {
      get_8_coeff_contexts_ver(levels, width, offsets, coefficients);
    } else {
      get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
    }
  }

  const int bhl = get_txb_bhl(tx_size);
  const int pos = scan[last_idx];
  if (last_idx <= (width << bhl) / 8)
    coeff_contexts[pos] = 1;
  else if (last_idx <= (width << bhl) / 4)
    coeff_contexts[pos] = 2;
  else
    coeff_contexts[pos] = 3;
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.16 Sekunden (vorverarbeitet am 2026-06-06) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.