/* * Copyright (c) 2016 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree.
*/
void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
uint8_t *dst_ptr, int src_stride, int dst_stride, int cols,
uint8_t *f, int size) {
uint8_t *src, *dst; int row; int col;
// While columns of length 16 can be processed, load them. for (col = 0; col < cols - 8; col += 16) {
uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7;
src = src_ptr - 2 * src_stride;
dst = dst_ptr;
// Load 8 values, transpose 4 of them, and discard 2 because they will be // reloaded later.
load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3);
a3 = a1;
a2 = a1 = a0; // Extend left border.
src += 2;
for (col = 0; col < cols; col += 8) {
uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6,
v_out_7; // Although the filter is meant to be applied vertically and is instead // being applied horizontally here it's OK because it's set in blocks of 8 // (or 16). const uint8x8_t filter = vld1_u8(f + col);
// Add in the first set because vext doesn't work with '0'.
*sum = vadd_s16(*sum, x);
*sumsq = vaddq_s32(*sumsq, xy);
// Shift x and xy to the right and sum. vext requires an immediate.
*sum = vadd_s16(*sum, vext_s16(zero, x, 1));
*sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1));
void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols, int flimit) { int row, col; const int32x4_t f = vdupq_n_s32(flimit);
assert(cols % 8 == 0);
for (row = 0; row < rows; ++row) { // Sum the first 8 elements, which are extended from s[0]. // sumsq gets primed with +16. int sumsq = src[0] * src[0] * 9 + 16; int sum = src[0] * 9;
// Sum (+square) the next 6 elements. // Skip [0] because it's included above. for (col = 1; col <= 6; ++col) {
sumsq += src[col] * src[col];
sum += src[col];
}
// Prime the sums. Later the loop uses the _high values to prime the new // vectors.
sumsq_high = vdupq_n_s32(sumsq);
sum_high = vdup_n_s16(sum);
// Manually extend the left border.
left_context = vdup_n_u8(src[0]);
for (col = 0; col < cols; col += 8) {
uint8x8_t mask, output;
int16x8_t x, y;
int32x4_t xy_low, xy_high;
s = vld1_u8(src + col);
if (col + 8 == cols) { // Last row. Extend border.
right_context = vdup_n_u8(src[col + 7]);
} else {
right_context = vld1_u8(src + col + 7);
}
x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));
y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));
xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
// Catch up to the last sum'd value.
sum_low = vdup_lane_s16(sum_high, 3);
sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);
// Need to do this sequentially because we need the max value from // sum_low.
sum_high = vdup_lane_s16(sum_low, 3);
sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);
void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols, int flimit) { int row, col, i; const int32x4_t f = vdupq_n_s32(flimit);
uint8x8_t below_context = vdup_n_u8(0);
// 8 columns are processed at a time. // If rows is less than 8 the bottom border extension fails.
assert(cols % 8 == 0);
assert(rows >= 8);
// Load and keep the first 8 values in memory. Process a vertical stripe that // is 8 wide. for (col = 0; col < cols; col += 8) {
uint8x8_t s, above_context[8];
int16x8_t sum, sum_tmp;
int32x4_t sumsq_low, sumsq_high;
// Load and extend the top border.
s = vld1_u8(dst); for (i = 0; i < 8; i++) {
above_context[i] = s;
}
sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s));
// sum * 9
sum = vmulq_n_s16(sum_tmp, 9);
// (sum * 9) * sum == sum * sum * 9
sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp));
sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp));
// Load and discard the next 6 values to prime sum and sumsq. for (i = 1; i <= 6; ++i) { const uint8x8_t a = vld1_u8(dst + i * pitch); const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a));
sum = vaddq_s16(sum, b);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.