/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include <math.h>
#include "config/aom_config.h"
#include "aom_dsp/arm/mem_neon.h"
#include "av1/common/txb_common.h"
#include "av1/encoder/encodetxb.h"
void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
const int height, uint8_t *const levels) {
const int stride = height + TX_PAD_HOR;
memset(levels - TX_PAD_TOP * stride, 0 ,
sizeof (*levels) * TX_PAD_TOP * stride);
memset(levels + stride * width, 0 ,
sizeof (*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
const int32x4_t zeros = vdupq_n_s32(0 );
int i = 0 ;
uint8_t *ls = levels;
const tran_low_t *cf = coeff;
if (height == 4 ) {
do {
const int32x4_t coeffA = vld1q_s32(cf);
const int32x4_t coeffB = vld1q_s32(cf + height);
const int16x8_t coeffAB =
vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
const int16x8_t absAB = vqabsq_s16(coeffAB);
const int8x8_t absABs = vqmovn_s16(absAB);
#if AOM_ARCH_AARCH64
const int8x16_t absAB8 =
vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
const uint8x16_t lsAB =
vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
#else
const int32x2x2_t absAB8 =
vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
const uint8x16_t lsAB =
vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0 ], absAB8.val[1 ]));
#endif
vst1q_u8(ls, lsAB);
ls += (stride << 1 );
cf += (height << 1 );
i += 2 ;
} while (i < width);
} else if (height == 8 ) {
do {
const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
const int16x8_t absAB = vqabsq_s16(coeffAB);
const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
vst1q_u8(ls, absAB8);
ls += stride;
cf += height;
i += 1 ;
} while (i < width);
} else {
do {
int j = 0 ;
do {
const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8 );
const int16x8_t absAB = vqabsq_s16(coeffAB);
const int16x8_t absCD = vqabsq_s16(coeffCD);
const uint8x16_t absABCD = vreinterpretq_u8_s8(
vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
vst1q_u8((ls + j), absABCD);
j += 16 ;
cf += 16 ;
} while (j < height);
*(int32_t *)(ls + height) = 0 ;
ls += stride;
i += 1 ;
} while (i < width);
}
}
// get_4_nz_map_contexts_2d coefficients:
static const DECLARE_ALIGNED(16 , uint8_t, c_4_po_2d[2 ][16 ]) = {
{ 0 , 1 , 6 , 6 , 1 , 6 , 6 , 21 , 6 , 6 , 21 , 21 , 6 , 21 , 21 , 21 },
{ 0 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 6 , 6 , 21 , 21 , 6 , 21 , 21 , 21 }
};
// get_4_nz_map_contexts_hor coefficients:
/* clang-format off */
#define SIG_COEF_CONTEXTS_2D_X4_051010 \
(SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5 ) << 8 ) + \
((SIG_COEF_CONTEXTS_2D + 10 ) << 16 ) + ((SIG_COEF_CONTEXTS_2D + 10 ) << 24 ))
/* clang-format on */
// get_4_nz_map_contexts_ver coefficients:
static const DECLARE_ALIGNED(16 , uint8_t, c_4_po_hor[16 ]) = {
SIG_COEF_CONTEXTS_2D + 0 , SIG_COEF_CONTEXTS_2D + 0 ,
SIG_COEF_CONTEXTS_2D + 0 , SIG_COEF_CONTEXTS_2D + 0 ,
SIG_COEF_CONTEXTS_2D + 5 , SIG_COEF_CONTEXTS_2D + 5 ,
SIG_COEF_CONTEXTS_2D + 5 , SIG_COEF_CONTEXTS_2D + 5 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10
};
// get_8_coeff_contexts_2d coefficients:
// if (width == 8)
static const DECLARE_ALIGNED(16 , uint8_t, c_8_po_2d_8[2 ][16 ]) = {
{ 0 , 1 , 6 , 6 , 21 , 21 , 21 , 21 , 1 , 6 , 6 , 21 , 21 , 21 , 21 , 21 },
{ 6 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 }
};
// if (width < 8)
static const DECLARE_ALIGNED(16 , uint8_t, c_8_po_2d_l[2 ][16 ]) = {
{ 0 , 11 , 6 , 6 , 21 , 21 , 21 , 21 , 11 , 11 , 6 , 21 , 21 , 21 , 21 , 21 },
{ 11 , 11 , 21 , 21 , 21 , 21 , 21 , 21 , 11 , 11 , 21 , 21 , 21 , 21 , 21 , 21 }
};
// if (width > 8)
static const DECLARE_ALIGNED(16 , uint8_t, c_8_po_2d_g[2 ][16 ]) = {
{ 0 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 },
{ 6 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 }
};
// get_4_nz_map_contexts_ver coefficients:
static const DECLARE_ALIGNED(16 , uint8_t, c_8_po_ver[16 ]) = {
SIG_COEF_CONTEXTS_2D + 0 , SIG_COEF_CONTEXTS_2D + 5 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 0 , SIG_COEF_CONTEXTS_2D + 5 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10
};
// get_16n_coeff_contexts_2d coefficients:
// real_width == real_height
static const DECLARE_ALIGNED(16 , uint8_t, c_16_po_2d_e[4 ][16 ]) = {
{ 0 , 1 , 6 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 },
{ 1 , 6 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 },
{ 6 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 },
{ 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 }
};
// real_width < real_height
static const DECLARE_ALIGNED(16 , uint8_t, c_16_po_2d_g[3 ][16 ]) = {
{ 0 , 11 , 6 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 },
{ 11 , 11 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 },
{ 11 , 11 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 }
};
// real_width > real_height
static const DECLARE_ALIGNED(16 , uint8_t, c_16_po_2d_l[3 ][16 ]) = {
{ 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 },
{ 6 , 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 },
{ 6 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 , 21 }
};
// get_16n_coeff_contexts_hor coefficients:
static const DECLARE_ALIGNED(16 , uint8_t, c_16_po_ver[16 ]) = {
SIG_COEF_CONTEXTS_2D + 0 , SIG_COEF_CONTEXTS_2D + 5 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10 ,
SIG_COEF_CONTEXTS_2D + 10 , SIG_COEF_CONTEXTS_2D + 10
};
// end of coefficients declaration area
static inline uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
const int byte_stride) {
#if AOM_ARCH_AARCH64
uint32x4_t v_data = vld1q_u32((uint32_t *)src);
v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1 );
v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2 );
v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3 );
return vreinterpretq_u8_u32(v_data);
#else
return load_unaligned_u8q(src, byte_stride);
#endif
}
static inline uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
const int byte_stride) {
#if AOM_ARCH_AARCH64
uint64x2_t v_data = vld1q_u64((uint64_t *)src);
v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1 );
return vreinterpretq_u8_u64(v_data);
#else
uint8x8_t v_data_low = vld1_u8(src);
uint8x8_t v_data_high = vld1_u8(src + byte_stride);
return vcombine_u8(v_data_low, v_data_high);
#endif
}
static inline uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
const int byte_stride) {
(void )byte_stride;
return vld1q_u8(src);
}
static inline void load_levels_4x4x5(const uint8_t *const src, const int stride,
const ptrdiff_t *const offsets,
uint8x16_t *const level) {
level[0 ] = load_8bit_4x4_to_1_reg(&src[1 ], stride);
level[1 ] = load_8bit_4x4_to_1_reg(&src[stride], stride);
level[2 ] = load_8bit_4x4_to_1_reg(&src[offsets[0 ]], stride);
level[3 ] = load_8bit_4x4_to_1_reg(&src[offsets[1 ]], stride);
level[4 ] = load_8bit_4x4_to_1_reg(&src[offsets[2 ]], stride);
}
static inline void load_levels_8x2x5(const uint8_t *const src, const int stride,
const ptrdiff_t *const offsets,
uint8x16_t *const level) {
level[0 ] = load_8bit_8x2_to_1_reg(&src[1 ], stride);
level[1 ] = load_8bit_8x2_to_1_reg(&src[stride], stride);
level[2 ] = load_8bit_8x2_to_1_reg(&src[offsets[0 ]], stride);
level[3 ] = load_8bit_8x2_to_1_reg(&src[offsets[1 ]], stride);
level[4 ] = load_8bit_8x2_to_1_reg(&src[offsets[2 ]], stride);
}
static inline void load_levels_16x1x5(const uint8_t *const src,
const int stride,
const ptrdiff_t *const offsets,
uint8x16_t *const level) {
level[0 ] = load_8bit_16x1_to_1_reg(&src[1 ], stride);
level[1 ] = load_8bit_16x1_to_1_reg(&src[stride], stride);
level[2 ] = load_8bit_16x1_to_1_reg(&src[offsets[0 ]], stride);
level[3 ] = load_8bit_16x1_to_1_reg(&src[offsets[1 ]], stride);
level[4 ] = load_8bit_16x1_to_1_reg(&src[offsets[2 ]], stride);
}
static inline uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
const uint8x16_t const_3 = vdupq_n_u8(3 );
const uint8x16_t const_4 = vdupq_n_u8(4 );
uint8x16_t count;
count = vminq_u8(level[0 ], const_3);
level[1 ] = vminq_u8(level[1 ], const_3);
level[2 ] = vminq_u8(level[2 ], const_3);
level[3 ] = vminq_u8(level[3 ], const_3);
level[4 ] = vminq_u8(level[4 ], const_3);
count = vaddq_u8(count, level[1 ]);
count = vaddq_u8(count, level[2 ]);
count = vaddq_u8(count, level[3 ]);
count = vaddq_u8(count, level[4 ]);
count = vrshrq_n_u8(count, 1 );
count = vminq_u8(count, const_4);
return count;
}
static inline void get_4_nz_map_contexts_2d(const uint8_t *levels,
const int width,
const ptrdiff_t *const offsets,
uint8_t *const coeff_contexts) {
const int stride = 4 + TX_PAD_HOR;
const uint8x16_t pos_to_offset_large = vdupq_n_u8(21 );
uint8x16_t pos_to_offset =
(width == 4 ) ? vld1q_u8(c_4_po_2d[0 ]) : vld1q_u8(c_4_po_2d[1 ]);
uint8x16_t count;
uint8x16_t level[5 ];
uint8_t *cc = coeff_contexts;
assert(!(width % 4 ));
int col = width;
do {
load_levels_4x4x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset);
vst1q_u8(cc, count);
pos_to_offset = pos_to_offset_large;
levels += 4 * stride;
cc += 16 ;
col -= 4 ;
} while (col);
coeff_contexts[0 ] = 0 ;
}
static inline void get_4_nz_map_contexts_ver(const uint8_t *levels,
const int width,
const ptrdiff_t *const offsets,
uint8_t *coeff_contexts) {
const int stride = 4 + TX_PAD_HOR;
const uint8x16_t pos_to_offset =
vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
uint8x16_t count;
uint8x16_t level[5 ];
assert(!(width % 4 ));
int col = width;
do {
load_levels_4x4x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset);
vst1q_u8(coeff_contexts, count);
levels += 4 * stride;
coeff_contexts += 16 ;
col -= 4 ;
} while (col);
}
static inline void get_4_nz_map_contexts_hor(const uint8_t *levels,
const int width,
const ptrdiff_t *const offsets,
uint8_t *coeff_contexts) {
const int stride = 4 + TX_PAD_HOR;
const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10 );
uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor);
uint8x16_t count;
uint8x16_t level[5 ];
assert(!(width % 4 ));
int col = width;
do {
load_levels_4x4x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset);
vst1q_u8(coeff_contexts, count);
pos_to_offset = pos_to_offset_large;
levels += 4 * stride;
coeff_contexts += 16 ;
col -= 4 ;
} while (col);
}
static inline void get_8_coeff_contexts_2d(const uint8_t *levels,
const int width,
const ptrdiff_t *const offsets,
uint8_t *coeff_contexts) {
const int stride = 8 + TX_PAD_HOR;
uint8_t *cc = coeff_contexts;
uint8x16_t count;
uint8x16_t level[5 ];
uint8x16_t pos_to_offset[3 ];
assert(!(width % 2 ));
if (width == 8 ) {
pos_to_offset[0 ] = vld1q_u8(c_8_po_2d_8[0 ]);
pos_to_offset[1 ] = vld1q_u8(c_8_po_2d_8[1 ]);
} else if (width < 8 ) {
pos_to_offset[0 ] = vld1q_u8(c_8_po_2d_l[0 ]);
pos_to_offset[1 ] = vld1q_u8(c_8_po_2d_l[1 ]);
} else {
pos_to_offset[0 ] = vld1q_u8(c_8_po_2d_g[0 ]);
pos_to_offset[1 ] = vld1q_u8(c_8_po_2d_g[1 ]);
}
pos_to_offset[2 ] = vdupq_n_u8(21 );
int col = width;
do {
load_levels_8x2x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset[0 ]);
vst1q_u8(cc, count);
pos_to_offset[0 ] = pos_to_offset[1 ];
pos_to_offset[1 ] = pos_to_offset[2 ];
levels += 2 * stride;
cc += 16 ;
col -= 2 ;
} while (col);
coeff_contexts[0 ] = 0 ;
}
static inline void get_8_coeff_contexts_ver(const uint8_t *levels,
const int width,
const ptrdiff_t *const offsets,
uint8_t *coeff_contexts) {
const int stride = 8 + TX_PAD_HOR;
const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver);
uint8x16_t count;
uint8x16_t level[5 ];
assert(!(width % 2 ));
int col = width;
do {
load_levels_8x2x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset);
vst1q_u8(coeff_contexts, count);
levels += 2 * stride;
coeff_contexts += 16 ;
col -= 2 ;
} while (col);
}
static inline void get_8_coeff_contexts_hor(const uint8_t *levels,
const int width,
const ptrdiff_t *const offsets,
uint8_t *coeff_contexts) {
const int stride = 8 + TX_PAD_HOR;
const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10 );
uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0 ),
vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5 ));
uint8x16_t count;
uint8x16_t level[5 ];
assert(!(width % 2 ));
int col = width;
do {
load_levels_8x2x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset);
vst1q_u8(coeff_contexts, count);
pos_to_offset = pos_to_offset_large;
levels += 2 * stride;
coeff_contexts += 16 ;
col -= 2 ;
} while (col);
}
static inline void get_16n_coeff_contexts_2d(const uint8_t *levels,
const int real_width,
const int real_height,
const int width, const int height,
const ptrdiff_t *const offsets,
uint8_t *coeff_contexts) {
const int stride = height + TX_PAD_HOR;
uint8_t *cc = coeff_contexts;
int col = width;
uint8x16_t pos_to_offset[5 ];
uint8x16_t pos_to_offset_large[3 ];
uint8x16_t count;
uint8x16_t level[5 ];
assert(!(height % 16 ));
pos_to_offset_large[2 ] = vdupq_n_u8(21 );
if (real_width == real_height) {
pos_to_offset[0 ] = vld1q_u8(c_16_po_2d_e[0 ]);
pos_to_offset[1 ] = vld1q_u8(c_16_po_2d_e[1 ]);
pos_to_offset[2 ] = vld1q_u8(c_16_po_2d_e[2 ]);
pos_to_offset[3 ] = vld1q_u8(c_16_po_2d_e[3 ]);
pos_to_offset[4 ] = pos_to_offset_large[0 ] = pos_to_offset_large[1 ] =
pos_to_offset_large[2 ];
} else if (real_width < real_height) {
pos_to_offset[0 ] = vld1q_u8(c_16_po_2d_g[0 ]);
pos_to_offset[1 ] = vld1q_u8(c_16_po_2d_g[1 ]);
pos_to_offset[2 ] = pos_to_offset[3 ] = pos_to_offset[4 ] =
vld1q_u8(c_16_po_2d_g[2 ]);
pos_to_offset_large[0 ] = pos_to_offset_large[1 ] = pos_to_offset_large[2 ];
} else { // real_width > real_height
pos_to_offset[0 ] = pos_to_offset[1 ] = vld1q_u8(c_16_po_2d_l[0 ]);
pos_to_offset[2 ] = vld1q_u8(c_16_po_2d_l[1 ]);
pos_to_offset[3 ] = vld1q_u8(c_16_po_2d_l[2 ]);
pos_to_offset[4 ] = pos_to_offset_large[2 ];
pos_to_offset_large[0 ] = pos_to_offset_large[1 ] = vdupq_n_u8(16 );
}
do {
int h = height;
do {
load_levels_16x1x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset[0 ]);
vst1q_u8(cc, count);
levels += 16 ;
cc += 16 ;
h -= 16 ;
pos_to_offset[0 ] = pos_to_offset_large[0 ];
} while (h);
pos_to_offset[0 ] = pos_to_offset[1 ];
pos_to_offset[1 ] = pos_to_offset[2 ];
pos_to_offset[2 ] = pos_to_offset[3 ];
pos_to_offset[3 ] = pos_to_offset[4 ];
pos_to_offset_large[0 ] = pos_to_offset_large[1 ];
pos_to_offset_large[1 ] = pos_to_offset_large[2 ];
levels += TX_PAD_HOR;
} while (--col);
coeff_contexts[0 ] = 0 ;
}
static inline void get_16n_coeff_contexts_ver(const uint8_t *levels,
const int width, const int height,
const ptrdiff_t *const offsets,
uint8_t *coeff_contexts) {
const int stride = height + TX_PAD_HOR;
const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10 );
uint8x16_t count;
uint8x16_t level[5 ];
assert(!(height % 16 ));
int col = width;
do {
uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver);
int h = height;
do {
load_levels_16x1x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset);
vst1q_u8(coeff_contexts, count);
pos_to_offset = pos_to_offset_large;
levels += 16 ;
coeff_contexts += 16 ;
h -= 16 ;
} while (h);
levels += TX_PAD_HOR;
} while (--col);
}
static inline void get_16n_coeff_contexts_hor(const uint8_t *levels,
const int width, const int height,
const ptrdiff_t *const offsets,
uint8_t *coeff_contexts) {
const int stride = height + TX_PAD_HOR;
uint8x16_t pos_to_offset[3 ];
uint8x16_t count;
uint8x16_t level[5 ];
assert(!(height % 16 ));
pos_to_offset[0 ] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0 );
pos_to_offset[1 ] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5 );
pos_to_offset[2 ] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10 );
int col = width;
do {
int h = height;
do {
load_levels_16x1x5(levels, stride, offsets, level);
count = get_coeff_contexts_kernel(level);
count = vaddq_u8(count, pos_to_offset[0 ]);
vst1q_u8(coeff_contexts, count);
levels += 16 ;
coeff_contexts += 16 ;
h -= 16 ;
} while (h);
pos_to_offset[0 ] = pos_to_offset[1 ];
pos_to_offset[1 ] = pos_to_offset[2 ];
levels += TX_PAD_HOR;
} while (--col);
}
// Note: levels[] must be in the range [0, 127], inclusive.
void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
const int16_t *const scan, const uint16_t eob,
const TX_SIZE tx_size,
const TX_CLASS tx_class,
int8_t *const coeff_contexts) {
const int last_idx = eob - 1 ;
if (!last_idx) {
coeff_contexts[0 ] = 0 ;
return ;
}
uint8_t *const coefficients = (uint8_t *const )coeff_contexts;
const int real_width = tx_size_wide[tx_size];
const int real_height = tx_size_high[tx_size];
const int width = get_txb_wide(tx_size);
const int height = get_txb_high(tx_size);
const int stride = height + TX_PAD_HOR;
ptrdiff_t offsets[3 ];
/* coeff_contexts must be 16 byte aligned. */
assert(!((intptr_t)coeff_contexts & 0 xf));
if (tx_class == TX_CLASS_2D) {
offsets[0 ] = 0 * stride + 2 ;
offsets[1 ] = 1 * stride + 1 ;
offsets[2 ] = 2 * stride + 0 ;
if (height == 4 ) {
get_4_nz_map_contexts_2d(levels, width, offsets, coefficients);
} else if (height == 8 ) {
get_8_coeff_contexts_2d(levels, width, offsets, coefficients);
} else {
get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
offsets, coefficients);
}
} else if (tx_class == TX_CLASS_HORIZ) {
offsets[0 ] = 2 * stride;
offsets[1 ] = 3 * stride;
offsets[2 ] = 4 * stride;
if (height == 4 ) {
get_4_nz_map_contexts_hor(levels, width, offsets, coefficients);
} else if (height == 8 ) {
get_8_coeff_contexts_hor(levels, width, offsets, coefficients);
} else {
get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
}
} else { // TX_CLASS_VERT
offsets[0 ] = 2 ;
offsets[1 ] = 3 ;
offsets[2 ] = 4 ;
if (height == 4 ) {
get_4_nz_map_contexts_ver(levels, width, offsets, coefficients);
} else if (height == 8 ) {
get_8_coeff_contexts_ver(levels, width, offsets, coefficients);
} else {
get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
}
}
const int bhl = get_txb_bhl(tx_size);
const int pos = scan[last_idx];
if (last_idx <= (width << bhl) / 8 )
coeff_contexts[pos] = 1 ;
else if (last_idx <= (width << bhl) / 4 )
coeff_contexts[pos] = 2 ;
else
coeff_contexts[pos] = 3 ;
}
Messung V0.5 in Prozent C=97 H=84 G=90
¤ Dauer der Verarbeitung: 0.16 Sekunden
(vorverarbeitet am 2026-06-06)
¤
*© Formatika GbR, Deutschland