/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
// look-up table for sqrt of number of pixels in a transform block // rounded up to the nearest integer. staticconstint sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6,
12, 12, 23, 23, 32, 32, 8,
8, 16, 16, 23, 23 };
staticinline int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, const int64_t ref_best_rd, const uint32_t hash) {
int32_t match_index = -1; if (ref_best_rd != INT64_MAX) { for (int i = 0; i < mb_rd_record->num; ++i) { constint index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; // If there is a match in the mb_rd_record, fetch the RD decision and // terminate early. if (mb_rd_record->mb_rd_info[index].hash_value == hash) {
match_index = index; break;
}
}
} return match_index;
}
const int64_t mse = *dist / bw / bh; // Normalized quantizer takes the transform upscaling factor (8 for tx size // smaller than 32) into account. const int16_t normalized_dc_q = dc_q >> 3; const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8; // For faster early skip decision, use dist to compare against threshold so // that quality risk is less for the skip=1 decision. Otherwise, use mse // since the fwd_txfm coeff checks will take care of quality // TODO(any): Use dist to return 0 when skip_txfm_level is 1
int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse; // Predict not to skip when error is larger than threshold. if (pred_err > mse_thresh) return 0; // Return as skip otherwise for aggressive early skip elseif (txfm_params->skip_txfm_level >= 2) return 1;
// Used to set proper context for early termination with skip = 1. staticinlinevoid set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats,
BLOCK_SIZE bsize, int64_t dist) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0]; constint n4 = bsize_to_num_blk(bsize); const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4);
memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
mbmi->tx_size = tx_size; for (int i = 0; i < n4; ++i)
set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1);
rd_stats->skip_txfm = 1; if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
rd_stats->dist = rd_stats->sse = (dist << 4); // Though decision is to make the block as skip based on luma stats, // it is possible that block becomes non skip after chroma rd. In addition // intermediate non skip costs calculated by caller function will be // incorrect, if rate is set as zero (i.e., if zero_blk_rate is not // accounted). Hence intermediate rate is populated to code the luma tx blks // as skip, the caller function based on final rd decision (i.e., skip vs // non-skip) sets the final rate accordingly. Here the rate populated // corresponds to coding all the tx blocks with zero_blk_rate (based on max tx // size possible) in the current block. Eg: For 128*128 block, rate would be // 4 * zero_blk_rate where zero_blk_rate corresponds to coding of one 64x64 tx // block as 'all zeros'
ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
ENTROPY_CONTEXT *ta = ctxa;
ENTROPY_CONTEXT *tl = ctxl; const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
TXB_CTX txb_ctx;
get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx); constint zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
.txb_skip_cost[txb_ctx.txb_skip_ctx][1];
rd_stats->rate = zero_blk_rate *
(block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
(block_size_high[bsize] >> tx_size_high_log2[tx_size]);
}
staticint get_search_init_depth(int mi_width, int mi_height, int is_inter, const SPEED_FEATURES *sf, int tx_size_search_method) { if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
if (sf->tx_sf.tx_size_search_lgr_block) { if (mi_width > mi_size_wide[BLOCK_64X64] ||
mi_height > mi_size_high[BLOCK_64X64]) return MAX_VARTX_DEPTH;
}
// Forward declaration of select_tx_block(); the definition appears later in
// this file.
static inline void select_tx_block(
    const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
    TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
    ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
    RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);
// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values // 0: Do not collect any RD stats // 1: Collect RD stats for transform units // 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS
// Returns the mean of the squared sample values over a w x h region.
//
// diff:   pointer to the top-left sample of the region.
// stride: number of elements between the starts of consecutive rows.
// w, h:   region width and height; both must be > 0.
static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
  // Check the precondition before doing any work.
  assert(w > 0 && h > 0);
  double sum = 0.0;
  for (int j = 0; j < h; ++j) {
    for (int i = 0; i < w; ++i) {
      // Widen to int before squaring; the square of an int16_t fits in int.
      const int err = diff[j * stride + i];
      sum += err * err;
    }
  }
  return sum / (w * h);
}
// Returns the mean of the absolute sample values over a w x h region.
//
// diff:   pointer to the top-left sample of the region.
// stride: number of elements between the starts of consecutive rows.
// w, h:   region width and height; both must be > 0.
static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
  // Check the precondition before doing any work.
  assert(w > 0 && h > 0);
  double sum = 0.0;
  for (int j = 0; j < h; ++j) {
    for (int i = 0; i < w; ++i) {
      sum += abs(diff[j * stride + i]);
    }
  }
  return sum / (w * h);
}
// This may happen because of hash collision. The eob stored in the hash // table is non-zero, but the real eob is zero. We need to make sure tx_type // is DCT_DCT in this case. if (plane == 0 && x->plane[plane].eobs[block] == 0 &&
best_tx_type != DCT_DCT) {
update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
}
}
}
staticunsigned pixel_dist_visible_only( const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, constint src_stride, const uint8_t *dst, constint dst_stride, const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows, int visible_cols) { unsigned sse;
QUANT_PARAM quant_param;
TxfmParam txfm_param;
av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
&quant_param); int tx_type; // to ensure we can try ones even outside of ext_tx_set of current block // this function should only be called for size < 16
assert(txsize_sqr_up_map[tx_size] <= TX_16X16);
txfm_param.tx_set_type = EXT_TX_SET_ALL16;
for (int i = 0; i < 4; i++) { float *cur_scores_2D = scores_2D_raw + i * 4;
cur_scores_2D[0] = vscores[i] * hscores[0];
cur_scores_2D[1] = vscores[i] * hscores[1];
cur_scores_2D[2] = vscores[i] * hscores[2];
cur_scores_2D[3] = vscores[i] * hscores[3];
}
assert(TX_TYPES == 16); // This version of the function only works when there are at most 16 classes. // So we will need to change the optimization or use av1_nn_softmax instead if // this ever gets changed.
av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw);
// Always keep the TX type with the highest score, prune all others with // score below score_thresh. int max_score_i = 0; float max_score = 0.0f;
uint16_t allow_bitmask = 0; float sum_score = 0.0; // Calculate sum of allowed tx type score and Populate allow bit mask based // on score_thresh and allowed_tx_mask int allow_count = 0; int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
TX_TYPE_INVALID }; float scores_2D[16] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
}; for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) { constint allow_tx_type =
check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]); if (!allow_tx_type) { continue;
} if (scores_2D_raw[tx_idx] > max_score) {
max_score = scores_2D_raw[tx_idx];
max_score_i = tx_idx;
} if (scores_2D_raw[tx_idx] >= score_thresh) { // Set allow mask based on score_thresh
set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]);
// Accumulate score of allowed tx type
sum_score += scores_2D_raw[tx_idx];
scores_2D[allow_count] = scores_2D_raw[tx_idx];
tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx];
allow_count += 1;
}
} if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) { // If even the tx_type with max score is pruned, this means that no other // tx_type is feasible. When this happens, we force enable max_score_i and // end the search.
set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]);
memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
*allowed_tx_mask = allow_bitmask; return;
}
// Sort tx type probability of all types if (allow_count <= 8) {
av1_sort_fi32_8(scores_2D, tx_type_allowed);
} else {
av1_sort_fi32_16(scores_2D, tx_type_allowed);
}
// Enable more pruning based on tx type probability and number of allowed tx // types if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) { float temp_score = 0.0; float score_ratio = 0.0; int tx_idx, tx_count = 0; constfloat inv_sum_score = 100 / sum_score; // Get allowed tx types based on sorted probability score and tx count for (tx_idx = 0; tx_idx < allow_count; tx_idx++) { // Skip the tx type which has more than 30% of cumulative // probability and allowed tx type count is more than 2 if (score_ratio > 30.0 && tx_count >= 2) break;
assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx])); // Calculate cumulative probability
temp_score += scores_2D[tx_idx];
// Calculate percentage of cumulative probability of allowed tx type
score_ratio = temp_score * inv_sum_score;
tx_count++;
} // Set remaining tx types as pruned for (; tx_idx < allow_count; tx_idx++)
unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]);
}
// Writes the features required by the ML model to predict tx split based on // mean and standard deviation values of the block and sub-blocks. // Returns the number of elements written to the output array which is at most // 12 currently. Hence 'features' buffer should be able to accommodate at least // 12 elements. staticinlineint get_mean_dev_features(const int16_t *data, int stride, int bw, int bh, float *features) { const int16_t *const data_ptr = &data[0]; constint subh = (bh >= bw) ? (bh >> 1) : bh; constint subw = (bw >= bh) ? (bw >> 1) : bw; constint num = bw * bh; constint sub_num = subw * subh; int feature_idx = 2; int total_x_sum = 0;
int64_t total_x2_sum = 0; int num_sub_blks = 0; double mean2_sum = 0.0f; float dev_sum = 0.0f;
for (int row = 0; row < bh; row += subh) { for (int col = 0; col < bw; col += subw) { int x_sum;
int64_t x2_sum; // TODO(any): Write a SIMD version. Clear registers.
aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
&x_sum, &x2_sum);
total_x_sum += x_sum;
total_x2_sum += x2_sum;
// Need to have at least one transform type allowed. if (allowed_tx_mask == 0) {
txk_allowed = (plane ? uv_tx_type : DCT_DCT);
allowed_tx_mask = (1 << txk_allowed);
}
staticint skip_trellis_opt_based_on_satd(MACROBLOCK *x,
QUANT_PARAM *quant_param, int plane, int block, TX_SIZE tx_size, int quant_b_adapt, int qstep, unsignedint coeff_opt_satd_threshold, int skip_trellis, int dc_only_blk) { if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX)) return skip_trellis;
// Prediction of skip block if residual mean and variance are less // than qstep based threshold if ((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) { // If the normalized mean of residual block is less than the dc qstep and // the normalized block variance is less than ac qstep, then the block is // assumed to be a skip block and its rdcost is updated accordingly.
best_rd_stats->skip_txfm = 1;
x->plane[plane].eobs[block] = 0;
if (is_cur_buf_hbd(xd))
*block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2);
x->plane[plane].txb_entropy_ctx[block] = 0;
} elseif (predict_dc_level > 1) { // Predict DC only blocks based on residual variance. // For chroma plane, this prediction is disabled for intra blocks. if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1;
}
}
// Search for the best transform type for a given transform block. // This function can be used for both inter and intra, both luma and chroma. staticvoid search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis,
int64_t ref_best_rd, RD_STATS *best_rd_stats) { const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0]; const TxfmSearchParams *txfm_params = &x->txfm_search_params;
int64_t best_rd = INT64_MAX;
uint16_t best_eob = 0;
TX_TYPE best_tx_type = DCT_DCT; int rate_cost = 0; struct macroblock_plane *const p = &x->plane[plane];
tran_low_t *orig_dqcoeff = p->dqcoeff;
tran_low_t *best_dqcoeff = x->dqcoeff_buf; constint tx_type_map_idx =
plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
av1_invalid_rd_stats(best_rd_stats);
// Bit mask to indicate which transform types are allowed in the RD search.
uint16_t tx_mask;
// Use DCT_DCT transform for DC only block. if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1)
tx_mask = 1 << DCT_DCT; else
tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
tx_size, txb_ctx, ftxs_mode, ref_best_rd,
&txk_allowed, txk_map); const uint16_t allowed_tx_mask = tx_mask;
if (is_cur_buf_hbd(xd)) {
block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
}
block_sse *= 16; // Use mse / qstep^2 based threshold logic to take decision of R-D // optimization of coeffs. For smaller residuals, coeff optimization // would be helpful. For larger residuals, R-D optimization may not be // effective. // TODO(any): Experiment with variance and mean based thresholds constint perform_block_coeff_opt =
((uint64_t)block_mse_q8 <=
(uint64_t)txfm_params->coeff_opt_thresholds[0] * qstep * qstep);
skip_trellis |= !perform_block_coeff_opt;
// Flag to indicate if distortion should be calculated in transform domain or // not during iterating through transform type candidates. // Transform domain distortion is accurate for higher residuals. // TODO(any): Experiment with variance and mean based thresholds int use_transform_domain_distortion =
(txfm_params->use_transform_domain_distortion > 0) &&
(block_mse_q8 >= txfm_params->tx_domain_dist_threshold) && // Any 64-pt transforms only preserves half the coefficients. // Therefore transform domain distortion is not valid for these // transform sizes.
(txsize_sqr_up_map[tx_size] != TX_64X64) && // Use pixel domain distortion for DC only blocks
!dc_only_blk; // Flag to indicate if an extra calculation of distortion in the pixel domain // should be performed at the end, after the best transform type has been // decided. int calc_pixel_domain_distortion_final =
txfm_params->use_transform_domain_distortion == 1 &&
use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD; if (calc_pixel_domain_distortion_final &&
(txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
// Calculate rate cost of quantized coefficients. if (quant_param.use_optimize_b) { // TODO(aomedia:3209): update Trellis quantization to take into account // quantization matrices.
av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
&rate_cost);
} else {
rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
cm->features.reduced_tx_set_used);
}
// If rd cost based on coeff rate alone is already more than best_rd, // terminate early. if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue;
// Calculate distortion. if (eobs_ptr[block] == 0) { // When eob is 0, pixel domain distortion is more efficient and accurate.
this_rd_stats.dist = this_rd_stats.sse = block_sse;
} elseif (dc_only_blk) {
this_rd_stats.sse = block_sse;
this_rd_stats.dist = dist_block_px_domain(
cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
} elseif (use_transform_domain_distortion) { const SCAN_ORDER *const scan_order =
get_scan(txfm_param.tx_size, txfm_param.tx_type);
dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
scan_order->scan, &this_rd_stats.dist,
&this_rd_stats.sse);
} else {
int64_t sse_diff = INT64_MAX; // high_energy threshold assumes that every pixel within a txfm block // has a residue energy of at least 25% of the maximum, i.e. 128 * 128 // for 8 bit. const int64_t high_energy_thresh =
((int64_t)128 * 128 * tx_size_2d[tx_size]); constint is_high_energy = (block_sse >= high_energy_thresh); if (tx_size == TX_64X64 || is_high_energy) { // Because 3 out 4 quadrants of transform coefficients are forced to // zero, the inverse transform has a tendency to overflow. sse_diff // is effectively the energy of those 3 quadrants, here we use it // to decide if we should do pixel domain distortion. If the energy // is mostly in first quadrant, then it is unlikely that we have // overflow issue in inverse transform. const SCAN_ORDER *const scan_order =
get_scan(txfm_param.tx_size, txfm_param.tx_type);
dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
scan_order->scan, &this_rd_stats.dist,
&this_rd_stats.sse);
sse_diff = block_sse - this_rd_stats.sse;
} if (tx_size != TX_64X64 || !is_high_energy ||
(sse_diff * 2) < this_rd_stats.sse) { const int64_t tx_domain_dist = this_rd_stats.dist;
this_rd_stats.dist = dist_block_px_domain(
cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); // For high energy blocks, occasionally, the pixel domain distortion // can be artificially low due to clamping at reconstruction stage // even when inverse transform output is hugely different from the // actual residue. if (is_high_energy && this_rd_stats.dist < tx_domain_dist)
this_rd_stats.dist = tx_domain_dist;
} else {
assert(sse_diff < INT64_MAX);
this_rd_stats.dist += sse_diff;
}
this_rd_stats.sse = block_sse;
}
// If the current best RD cost is much worse than the reference RD cost, // terminate early. if (cpi->sf.tx_sf.adaptive_txb_search_level) { if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) >
ref_best_rd) { break;
}
}
// Terminate transform type search if the block has been quantized to // all zero. if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break;
}
// Point dqcoeff to the quantized coefficients corresponding to the best // transform type, then we can skip transform and quantization, e.g. in the // final pixel domain distortion calculation and recon_intra().
p->dqcoeff = best_dqcoeff;
// Intra mode needs decoded pixels such that the next transform block // can use them for prediction.
recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
p->dqcoeff = orig_dqcoeff;
}
// Pick transform type for a luma transform block of tx_size. Note this function // is used only for inter-predicted blocks. staticinlinevoid tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x,
TX_SIZE tx_size, int blk_row, int blk_col, int block, int plane_bsize, TXB_CTX *txb_ctx,
RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode,
int64_t ref_rdcost) {
assert(is_inter_block(x->e_mbd.mi[0]));
RD_STATS this_rd_stats; constint skip_trellis = 0;
search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats);
// Search for the best transform partition(recursive)/type for a given // inter-predicted luma block. The obtained transform selection will be saved // in xd->mi[0], the corresponding RD stats will be saved in rd_stats. staticinlinevoid select_tx_block( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
assert(tx_size < TX_SIZES_ALL);
av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) {
*is_cost_valid = 0; return;
}
// Prune tx_split and no-split based on sub-block properties. if (tx_size != TX_4X4 && try_split == 1 && try_no_split == 1 &&
cpi->sf.tx_sf.prune_tx_size_level > 0) {
prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size,
&try_no_split, &try_split,
cpi->sf.tx_sf.prune_tx_size_level);
}
if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition) { if (x->try_merge_partition && try_split && p->eobs[block]) try_no_split = 0;
}
// Try using current block as a single transform block without split. if (try_no_split) {
try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
ftxs_mode, &no_split);
// Speed features for early termination. constint search_level = cpi->sf.tx_sf.adaptive_txb_search_level; if (search_level) { if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) {
*is_cost_valid = 0; return;
} if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) {
try_split = 0;
}
} if (cpi->sf.tx_sf.txb_split_cap) { if (p->eobs[block] == 0) try_split = 0;
}
}
// ML based speed feature to skip searching for split transform blocks. if (x->e_mbd.bd == 8 && try_split &&
!(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) { constint threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh; if (threshold >= 0) { constint split_score =
ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size); if (split_score < -threshold) try_split = 0;
}
}
RD_STATS split_rd_stats;
split_rd_stats.rdcost = INT64_MAX; // Try splitting current block into smaller transform blocks. if (try_split) {
try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
&split_rd_stats);
}
// Disable the pruning logic using NN model for the following cases: // 1) Lossless coding as only 4x4 transform is evaluated in this case // 2) When transform and current block sizes do not match as the features are // obtained over the current block // 3) When operating bit-depth is not 8-bit as the input features are not // scaled according to bit-depth. if (xd->lossless[mbmi->segment_id] || txsize_to_bsize[tx_size] != bsize ||
xd->bd != 8) return;
// Currently NN model based pruning is supported only when largest transform // size is 8x8 if (tx_size != TX_8X8) return;
// Neural network model is a sequential neural net and was trained using SGD // optimizer. The model can be further improved in terms of speed/quality by // considering the following experiments: // 1) Generate ML model by training with balanced data for different learning // rates and optimizers. // 2) Experiment with ML model by adding features related to the statistics of // top and left pixels to capture the accuracy of reconstructed neighbouring // pixels for 4x4 blocks numbered 1, 2, 3 in 8x8 block, source variance of 4x4 // sub-blocks, etc. // 3) Generate ML models for transform blocks other than 8x8. const NN_CONFIG *const nn_config = &av1_intra_tx_split_nnconfig_8x8; constfloat *const intra_tx_prune_thresh = av1_intra_tx_prune_nn_thresh_8x8;
/*!\brief Transform type search for luma macroblock with fixed transform size.
 *
 * \ingroup transform_search
 * Search for the best transform type and return the transform coefficients RD
 * cost of current luma macroblock with the given uniform transform size.
 *
 * \param[in]    x              Pointer to structure holding the data for the
 *                              current encoding macroblock
 * \param[in]    cpi            Top-level encoder structure
 * \param[in]    rd_stats       Pointer to struct to keep track of the RD stats
 * \param[in]    ref_best_rd    Best RD cost seen for this block so far
 * \param[in]    bs             Size of the current macroblock
 * \param[in]    tx_size        The given transform size
 * \param[in]    ftxs_mode      Transform search mode specifying desired speed
 *                              and quality tradeoff
 * \param[in]    skip_trellis   Binary flag indicating if trellis optimization
 *                              should be skipped
 * \return       An int64_t value that is the best RD cost found.
 */
static int64_t uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
                                RD_STATS *rd_stats, int64_t ref_best_rd,
                                BLOCK_SIZE bs, TX_SIZE tx_size,
                                FAST_TX_SEARCH_MODE ftxs_mode,
                                int skip_trellis) {
  assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = xd->mi[0];
  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
  const ModeCosts *mode_costs = &x->mode_costs;
  const int is_inter = is_inter_block(mbmi);

  // The transform size is only costed when the search mode selects it per
  // block and this block size signals a transform size.
  const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
                        block_signals_txsize(mbmi->bsize);
  int tx_size_rate = 0;
  if (tx_select) {
    const int ctx = txfm_partition_context(
        xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
    tx_size_rate = is_inter ? mode_costs->txfm_partition_cost[ctx][0]
                            : tx_size_cost(x, bs, tx_size);
  }

  const int skip_ctx = av1_get_skip_txfm_context(xd);
  const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
  const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
  const int64_t skip_txfm_rd =
      is_inter ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
  const int64_t no_this_rd =
      RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
  // NOTE(review): nothing visible in this block fills rd_stats or consumes
  // skip_txfm_rd / no_this_rd (nor cpi, ref_best_rd, ftxs_mode, skip_trellis).
  // Presumably the per-plane RD search belongs here — confirm against the
  // complete file before relying on this function as written.

  int64_t rd;
  // rdstats->rate should include all the rate except skip/non-skip cost as the
  // same is accounted in the caller functions after rd evaluation of all
  // planes. However the decisions should be done after considering the
  // skip/non-skip header cost
  if (rd_stats->skip_txfm && is_inter) {
    rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
  } else {
    // Intra blocks are always signalled as non-skip
    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
                rd_stats->dist);
    rd_stats->rate += tx_size_rate;
  }

  // Check if forcing the block to skip transform leads to smaller RD cost.
  if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
    int64_t temp_skip_txfm_rd =
        RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
    if (temp_skip_txfm_rd <= rd) {
      rd = temp_skip_txfm_rd;
      rd_stats->rate = 0;
      rd_stats->dist = rd_stats->sse;
      rd_stats->skip_txfm = 1;
    }
  }

  return rd;
}
// Search for the best uniform transform size and type for current coding block. staticinlinevoid choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
MACROBLOCK *x,
RD_STATS *rd_stats,
int64_t ref_best_rd,
BLOCK_SIZE bs) {
av1_invalid_rd_stats(rd_stats);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
TxfmSearchParams *const txfm_params = &x->txfm_search_params; const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs]; constint tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT; int start_tx; // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls // how many times of splitting is allowed during the RD search. int init_depth;
#if !CONFIG_REALTIME_ONLY if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_SPLIT) break;
// Set the flag to enable the evaluation of NN classifier to prune transform // depths. As the features are based on intra residual information of // largest transform, the evaluation of NN model is enabled only for this // case.
txfm_params->enable_nn_prune_intra_tx_depths =
(cpi->sf.tx_sf.prune_intra_tx_depths_using_nn && tx_size == start_tx); #endif
RD_STATS this_rd_stats; // When the speed feature use_rd_based_breakout_for_intra_tx_search is // enabled, use the known minimum best_rd for early termination. const int64_t rd_thresh =
cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search
? AOMMIN(ref_best_rd, best_rd)
: ref_best_rd;
rd[depth] = uniform_txfm_yrd(cpi, x, &this_rd_stats, rd_thresh, bs, tx_size,
FTXS_NONE, skip_trellis); if (rd[depth] < best_rd) {
av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks);
av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks);
best_tx_size = tx_size;
best_rd = rd[depth];
*rd_stats = this_rd_stats;
} if (tx_size == TX_4X4) break; // If we are searching three depths, prune the smallest size depending // on rd results for the first two depths for low contrast blocks. if (depth > init_depth && depth != MAX_TX_DEPTH &&
x->source_variance < 256) { if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
}
}
#if !CONFIG_REALTIME_ONLY // Reset the flags to avoid any unintentional evaluation of NN model and // consumption of prune depths.
txfm_params->enable_nn_prune_intra_tx_depths = false;
txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_NONE; #endif
}
// Search for the best transform type for the given transform block in the // given plane/channel, and calculate the corresponding RD cost. staticinlinevoid block_rd_txfm(int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct rdcost_block_args *args = arg; if (args->exit_early) {
args->incomplete_exit = 1; return;
}
if (args.current_rd > ref_best_rd) {
args.exit_early = 1; break;
}
av1_set_txb_context(x, 0, i, tx_size, a, l);
i += step;
}
}
if (args.incomplete_exit) av1_invalid_rd_stats(&args.rd_stats);
*rd_stats = args.rd_stats; if (rd_stats->rate == INT_MAX) return INT64_MAX;
int64_t rd; // rdstats->rate should include all the rate except skip/non-skip cost as the // same is accounted in the caller functions after rd evaluation of all // planes. However the decisions should be done after considering the // skip/non-skip header cost if (rd_stats->skip_txfm && is_inter) {
rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
} else { // Intra blocks are always signalled as non-skip
rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
rd_stats->dist);
rd_stats->rate += tx_size_rate;
} // Check if forcing the block to skip transform leads to smaller RD cost. if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
int64_t temp_skip_txfm_rd =
RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); if (temp_skip_txfm_rd <= rd) {
rd = temp_skip_txfm_rd;
rd_stats->rate = 0;
rd_stats->dist = rd_stats->sse;
rd_stats->skip_txfm = 1;
}
}
return rd;
}
// Search for the best transform type for a luma inter-predicted block, given // the transform block partitions. // This function is used only when some speed features are enabled. staticinlinevoid tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size,
BLOCK_SIZE plane_bsize, int depth,
ENTROPY_CONTEXT *above_ctx,
ENTROPY_CONTEXT *left_ctx,
TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
int64_t ref_best_rd, RD_STATS *rd_stats,
FAST_TX_SEARCH_MODE ftxs_mode) {
assert(tx_size < TX_SIZES_ALL);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
assert(is_inter_block(mbmi)); constint max_blocks_high = max_block_high(xd, plane_bsize, 0); constint max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
// search for tx type with tx sizes already decided for a inter-predicted luma // partition block. It's used only when some speed features are enabled. // Return value 0: early termination triggered, no valid rd cost available; // 1: rd cost values are valid. staticint inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
RD_STATS *rd_stats, BLOCK_SIZE bsize,
int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) { if (ref_best_rd < 0) {
av1_invalid_rd_stats(rd_stats); return 0;
}
// NOTE(review): fragment; the enclosing function's signature is not visible
// in this excerpt. When the refinement pass through inter_block_yrd() returns
// 0, INT64_MAX is returned to the caller.
// If fast_tx_search is true, only DCT and 1D DCT were tested in // select_inter_block_yrd() above. Do a better search for tx type with // tx sizes already decided. if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) { if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) return INT64_MAX;
}
// NOTE(review): fragment; the enclosing function's signature and the
// declarations of xd, txfm_params and rd are not visible in this excerpt.
// The bare `return;` statements show that the function returns void.
// If modeled RD cost is a lot worse than the best so far, terminate early. if (cpi->sf.tx_sf.model_based_prune_tx_search_level &&
ref_best_rd != INT64_MAX) { if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return;
}
// Hashing based speed feature. If the hash of the prediction residue block is // found in the hash table, use previous search results and terminate early.
// The hash is used only when use_mb_rd_hash is set and the block passes the
// border test: its start is at or after the tile start, and start + size is
// strictly less than the tile end, in both directions.
uint32_t hash = 0;
MB_RD_RECORD *mb_rd_record = NULL; constint mi_row = x->e_mbd.mi_row; constint mi_col = x->e_mbd.mi_col; constint within_border =
mi_row >= xd->tile.mi_row_start &&
(mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
mi_col >= xd->tile.mi_col_start &&
(mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); constint is_mb_rd_hash_enabled =
(within_border && cpi->sf.rd_sf.use_mb_rd_hash); constint n4 = bsize_to_num_blk(bsize); if (is_mb_rd_hash_enabled) {
hash = get_block_residue_hash(x, bsize);
// find_mb_rd_info() returns -1 when no stored record has this hash; it also
// skips the lookup entirely when ref_best_rd is INT64_MAX.
mb_rd_record = x->txfm_search_info.mb_rd_record; constint match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); if (match_index != -1) {
MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
fetch_mb_rd_info(n4, mb_rd_info, rd_stats, x); return;
}
}
// If we predict that skip is the optimal RD decision - set the respective // context and terminate early.
// predict_skip_txfm() is only called when skip_txfm_level is non-zero; it
// receives &dist, and that value is then passed on to set_skip_txfm().
int64_t dist; if (txfm_params->skip_txfm_level &&
predict_skip_txfm(x, bsize, &dist,
cpi->common.features.reduced_tx_set_used)) {
set_skip_txfm(x, rd_stats, bsize, dist); // Save the RD search results into mb_rd_record. if (is_mb_rd_hash_enabled)
save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record); return;
} #if CONFIG_SPEED_STATS
++x->txfm_search_info.tx_search_count; #endif// CONFIG_SPEED_STATS
// NOTE(review): the code that computes `rd` is not visible in this excerpt.
// An rd of INT64_MAX invalidates rd_stats and returns without saving a record.
if (rd == INT64_MAX) { // We should always find at least one candidate unless ref_best_rd is less // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type // might have failed to find something better)
assert(ref_best_rd != INT64_MAX);
av1_invalid_rd_stats(rd_stats); return;
}
// Save the RD search results into mb_rd_record. if (is_mb_rd_hash_enabled) {
assert(mb_rd_record != NULL);
save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
}
}
// NOTE(review): fragment; the enclosing function's signature and the
// declarations of xd, mbmi, tx_params, is_inter, mi_row and mi_col are not
// visible in this excerpt. The bare `return;` statements show that the
// function returns void.
// Hashing based speed feature for inter blocks. If the hash of the residue // block is found in the table, use previously saved search results and // terminate early.
// mb_rd_record stays NULL unless is_inter and use_mb_rd_hash are set and the
// block passes the border test (start at or after the tile start, start + size
// strictly less than the tile end). The later `if (mb_rd_record)` checks use
// that to decide whether results are saved.
uint32_t hash = 0;
MB_RD_RECORD *mb_rd_record = NULL; constint num_blks = bsize_to_num_blk(bs); if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) { constint within_border =
mi_row >= xd->tile.mi_row_start &&
(mi_row + mi_size_high[bs] < xd->tile.mi_row_end) &&
mi_col >= xd->tile.mi_col_start &&
(mi_col + mi_size_wide[bs] < xd->tile.mi_col_end); if (within_border) {
hash = get_block_residue_hash(x, bs);
// find_mb_rd_info() returns -1 when no stored record has this hash; it also
// skips the lookup entirely when ref_best_rd is INT64_MAX.
mb_rd_record = x->txfm_search_info.mb_rd_record; constint match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); if (match_index != -1) {
MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
fetch_mb_rd_info(num_blks, mb_rd_info, rd_stats, x); return;
}
}
}
// If we predict that skip is the optimal RD decision - set the respective // context and terminate early.
// The skip prediction is attempted only for inter blocks in non-lossless
// segments, and only when skip_txfm_level is non-zero.
int64_t dist; if (tx_params->skip_txfm_level && is_inter &&
!xd->lossless[mbmi->segment_id] &&
predict_skip_txfm(x, bs, &dist,
cpi->common.features.reduced_tx_set_used)) { // Populate rdstats as per skip decision
set_skip_txfm(x, rd_stats, bs, dist); // Save the RD search results into mb_rd_record. if (mb_rd_record) {
save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
} return;
}
// Transform size selection: lossless segments call choose_smallest_tx_size(),
// the USE_LARGESTALL search method calls choose_largest_tx_size(), and every
// other case calls choose_tx_size_type_from_rd().
if (xd->lossless[mbmi->segment_id]) { // Lossless mode can only pick the smallest (4x4) transform size.
choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
} elseif (tx_params->tx_size_search_method == USE_LARGESTALL) {
choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
} else {
choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
}
// Save the RD search results into mb_rd_record for possible reuse in future. if (mb_rd_record) {
save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
}
}
// Accumulates the RD stats of planes 1 through MAX_MB_PLANE - 1 into rd_stats.
// rd_stats is reset first. Returns 0 immediately when ref_best_rd is negative
// and 1 immediately when x->e_mbd.is_chroma_ref is zero.
// NOTE(review): the declarations of xd, is_inter, plane_bsize, this_rd and
// skip_txfm_rd, as well as the final return statement, are not visible in
// this excerpt.
int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
BLOCK_SIZE bsize, int64_t ref_best_rd) {
av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) return 0; if (!x->e_mbd.is_chroma_ref) return 1;
// For inter blocks, av1_subtract_plane() is called for every plane from 1
// upwards before the per-plane search starts.
if (is_inter) { for (int plane = 1; plane < MAX_MB_PLANE; ++plane)
av1_subtract_plane(x, plane_bsize, plane);
}
// Every plane in the loop uses the transform size obtained for AOM_PLANE_U.
// The loop stops at the first plane that yields an invalid cost or pushes the
// accumulated cost above ref_best_rd.
constint skip_trellis = 0; const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); int is_cost_valid = 1; for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
RD_STATS this_rd_stats;
int64_t chroma_ref_best_rd = ref_best_rd; // For inter blocks, refined ref_best_rd is used for early exit // For intra blocks, even though current rd crosses ref_best_rd, early // exit is not recommended as current rd is used for gating subsequent // modes as well (say, for angular modes) // TODO(any): Extend the early exit mechanism for intra modes as well if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter &&
chroma_ref_best_rd != INT64_MAX)
// The per-plane budget is ref_best_rd minus the smaller of this_rd and
// skip_txfm_rd.
chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_txfm_rd);
// A rate of INT_MAX in this_rd_stats after the call marks the cost as invalid.
av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane,
plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis); if (this_rd_stats.rate == INT_MAX) {
is_cost_valid = 0; break;
}
av1_merge_rd_stats(rd_stats, &this_rd_stats);
// this_rd uses the merged rate and dist; skip_txfm_rd uses zero rate with the
// merged sse in place of dist.
this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
skip_txfm_rd = RDCOST(x->rdmult, 0, rd_stats->sse); if (AOMMIN(this_rd, skip_txfm_rd) > ref_best_rd) {
is_cost_valid = 0; break;
}
}
if (!is_cost_valid) { // reset cost value
av1_invalid_rd_stats(rd_stats);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.