/*
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <string.h>
#include <stdio.h>
#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/dump.h"
#include "common/frame.h"
#include "common/intops.h"
#include "src/cdef_apply.h"
#include "src/ctx.h"
#include "src/ipred_prepare.h"
#include "src/lf_apply.h"
#include "src/lr_apply.h"
#include "src/recon.h"
#include "src/scan.h"
#include "src/tables.h"
#include "src/wedge.h"
/*
 * Read one Golomb-coded value from the entropy decoder.
 *
 * Equiprobable bits are consumed until a set bit is seen (or 32 clear bits
 * have been counted); that count is the number of payload bits that follow.
 * The payload is appended below an implicit leading 1 and the result is
 * returned minus one, so the shortest code decodes to 0.
 */
static inline unsigned read_golomb(MsacContext *const msac) {
    int prefix_len = 0;

    /* The bit is always read before the length cap is tested, so the bit
     * that terminates the prefix is consumed even when the cap is reached. */
    for (;;) {
        if (dav1d_msac_decode_bool_equi(msac))
            break;
        if (prefix_len >= 32)
            break;
        prefix_len++;
    }

    unsigned value = 1;
    for (int remaining = prefix_len; remaining != 0; remaining--)
        value = (value << 1) + dav1d_msac_decode_bool_equi(msac);

    return value - 1;
}
/*
 * Select the CDF context for the "whole transform block is zero" flag.
 *
 * t_dim   dimensions of the transform (lw/lh select how many context bytes
 *         are inspected along the above/left edges)
 * bs      block size, used to look up dav1d_block_dimensions[]
 * a, l    above / left coefficient-context bytes for this position
 * chroma  non-zero for chroma planes
 * layout  pixel layout, only used to derive chroma subsampling
 *
 * Luma: returns 0 when b_dim[2]/b_dim[3] equal the transform's lw/lh;
 * otherwise all context bytes along each edge are OR-ed together, the low
 * six bits are clamped to 4 and the pair indexes dav1d_skip_ctx[][].
 * Chroma: returns 7, plus 3 if the transform does not cover the (subsampled)
 * block, plus one for each edge whose context bytes are not all 0x40.
 *
 * NOTE(review): a and l are read through wider integer types (up to 16
 * bytes); this presumes the caller's arrays are suitably sized and aligned
 * -- confirm against the context struct definitions.
 */
static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
                                    const enum BlockSize bs,
                                    const uint8_t *const a,
                                    const uint8_t *const l,
                                    const int chroma,
                                    const enum Dav1dPixelLayout layout)
{
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    if (!chroma) {
        if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh)
            return 0;

        unsigned above_bits, left_bits;

        /* OR every context byte covered by the transform edge into the low
         * byte of `out`. `tx` is always a compile-time constant here, so the
         * conditionals fold away. */
#define MERGE_CTX(out, src, type, tx) \
        do { \
            if (tx == TX_64X64) { \
                uint64_t wide = *(const uint64_t *) src; \
                wide |= *(const uint64_t *) &src[8]; \
                out = (unsigned) (wide >> 32) | (unsigned) wide; \
            } else { \
                out = *(const type *) src; \
            } \
            if (tx == TX_32X32) out |= *(const type *) &src[sizeof(type)]; \
            if (tx >= TX_16X16) out |= out >> 16; \
            if (tx >= TX_8X8) out |= out >> 8; \
        } while (0)

        /* The default label is deliberately placed first (falling through
         * into the smallest size): with the MSVC CRT the assertion helper is
         * not marked noreturn, and a trailing default would trigger
         * uninitialized-variable complaints in debug builds. */
        switch (t_dim->lw) {
        default:
            assert(0); /* fall-through */
        case TX_4X4:
            MERGE_CTX(above_bits, a, uint8_t, TX_4X4);
            break;
        case TX_8X8:
            MERGE_CTX(above_bits, a, uint16_t, TX_8X8);
            break;
        case TX_16X16:
            MERGE_CTX(above_bits, a, uint32_t, TX_16X16);
            break;
        case TX_32X32:
            MERGE_CTX(above_bits, a, uint32_t, TX_32X32);
            break;
        case TX_64X64:
            MERGE_CTX(above_bits, a, uint32_t, TX_64X64);
            break;
        }
        switch (t_dim->lh) {
        default:
            assert(0); /* fall-through */
        case TX_4X4:
            MERGE_CTX(left_bits, l, uint8_t, TX_4X4);
            break;
        case TX_8X8:
            MERGE_CTX(left_bits, l, uint16_t, TX_8X8);
            break;
        case TX_16X16:
            MERGE_CTX(left_bits, l, uint32_t, TX_16X16);
            break;
        case TX_32X32:
            MERGE_CTX(left_bits, l, uint32_t, TX_32X32);
            break;
        case TX_64X64:
            MERGE_CTX(left_bits, l, uint32_t, TX_64X64);
            break;
        }
#undef MERGE_CTX

        return dav1d_skip_ctx[umin(above_bits & 0x3F, 4)][umin(left_bits & 0x3F, 4)];
    }

    /* Chroma */
    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
    const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
                            b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
    unsigned above_set, left_set;

    /* Each edge contributes 1 unless every inspected byte equals 0x40. */
    switch (t_dim->lw) {
    default:
        assert(0); /* fall-through */
    case TX_4X4:
        above_set = *(const uint8_t *) a != 0x40;
        break;
    case TX_8X8:
        above_set = *(const uint16_t *) a != 0x4040;
        break;
    case TX_16X16:
        above_set = *(const uint32_t *) a != 0x40404040U;
        break;
    case TX_32X32:
        above_set = *(const uint64_t *) a != 0x4040404040404040ULL;
        break;
    }
    switch (t_dim->lh) {
    default:
        assert(0); /* fall-through */
    case TX_4X4:
        left_set = *(const uint8_t *) l != 0x40;
        break;
    case TX_8X8:
        left_set = *(const uint16_t *) l != 0x4040;
        break;
    case TX_16X16:
        left_set = *(const uint32_t *) l != 0x40404040U;
        break;
    case TX_32X32:
        left_set = *(const uint64_t *) l != 0x4040404040404040ULL;
        break;
    }

    return 7 + not_one_blk * 3 + above_set + left_set;
}
/*
 * Derive the context index used when decoding the sign of the DC coefficient.
 *
 * tx selects how many context bytes are read from the above (a) and left (l)
 * arrays; each case loads 1, 2, 4, 8 or 16 bytes per direction. Only the top
 * two bits of every byte (mask 0xC0) are used. These 2-bit fields are summed
 * across all loaded bytes (the multiplications by 0x04040404 / 0x0101... move
 * the byte-wise sum into the top byte), and the two constants subtracted at
 * the end of each case equal the number of above and left bytes summed.
 *
 * Returns 0 when the biased sum s is zero, 1 when it is negative and 2 when
 * it is positive.
 *
 * NOTE(review): the per-byte 2-bit field looks like the DC sign class that
 * decode_coefs() stores in bits 6-7 of *res_ctx (0 negative, 1 zero,
 * 2 positive) -- confirm against the callers that fill a/l.
 * NOTE(review): a and l are read through wider integer types; this presumes
 * suitably sized/aligned arrays -- confirm.
 */
static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
const uint8_t *const a,
const uint8_t *const l)
{
uint64_t mask = 0 xC0C0C0C0C0C0C0C0ULL, mul = 0 x0101010101010101ULL;
int s;
#if ARCH_X86_64 && defined (__GNUC__)
/* Coerce compilers into producing better code. For some reason
* every x86-64 compiler is awful at handling 64-bit constants. */
__asm__("" : "+r" (mask), "+r" (mul));
#endif
switch (tx) {
default : assert(0 ); /* fall-through */
/* Square transform sizes. */
case TX_4X4: {
int t = *(const uint8_t *) a >> 6 ;
t += *(const uint8_t *) l >> 6 ;
s = t - 1 - 1 ;
break ;
}
case TX_8X8: {
uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
t += *(const uint16_t *) l & (uint32_t) mask;
t *= 0 x04040404U;
s = (int ) (t >> 24 ) - 2 - 2 ;
break ;
}
case TX_16X16: {
uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6 ;
t += (*(const uint32_t *) l & (uint32_t) mask) >> 6 ;
t *= (uint32_t) mul;
s = (int ) (t >> 24 ) - 4 - 4 ;
break ;
}
case TX_32X32: {
uint64_t t = (*(const uint64_t *) a & mask) >> 6 ;
t += (*(const uint64_t *) l & mask) >> 6 ;
t *= mul;
s = (int ) (t >> 56 ) - 8 - 8 ;
break ;
}
case TX_64X64: {
uint64_t t = (*(const uint64_t *) &a[0 ] & mask) >> 6 ;
t += (*(const uint64_t *) &a[8 ] & mask) >> 6 ;
t += (*(const uint64_t *) &l[0 ] & mask) >> 6 ;
t += (*(const uint64_t *) &l[8 ] & mask) >> 6 ;
t *= mul;
s = (int ) (t >> 56 ) - 16 - 16 ;
break ;
}
/* Rectangular transform sizes: above and left byte counts differ. */
case RTX_4X8: {
uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
t += *(const uint16_t *) l & (uint32_t) mask;
t *= 0 x04040404U;
s = (int ) (t >> 24 ) - 1 - 2 ;
break ;
}
case RTX_8X4: {
uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
t += *(const uint8_t *) l & (uint32_t) mask;
t *= 0 x04040404U;
s = (int ) (t >> 24 ) - 2 - 1 ;
break ;
}
case RTX_8X16: {
uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
t += *(const uint32_t *) l & (uint32_t) mask;
t = (t >> 6 ) * (uint32_t) mul;
s = (int ) (t >> 24 ) - 2 - 4 ;
break ;
}
case RTX_16X8: {
uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
t += *(const uint16_t *) l & (uint32_t) mask;
t = (t >> 6 ) * (uint32_t) mul;
s = (int ) (t >> 24 ) - 4 - 2 ;
break ;
}
case RTX_16X32: {
uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
t += *(const uint64_t *) l & mask;
t = (t >> 6 ) * mul;
s = (int ) (t >> 56 ) - 4 - 8 ;
break ;
}
case RTX_32X16: {
uint64_t t = *(const uint64_t *) a & mask;
t += *(const uint32_t *) l & (uint32_t) mask;
t = (t >> 6 ) * mul;
s = (int ) (t >> 56 ) - 8 - 4 ;
break ;
}
case RTX_32X64: {
uint64_t t = (*(const uint64_t *) &a[0 ] & mask) >> 6 ;
t += (*(const uint64_t *) &l[0 ] & mask) >> 6 ;
t += (*(const uint64_t *) &l[8 ] & mask) >> 6 ;
t *= mul;
s = (int ) (t >> 56 ) - 8 - 16 ;
break ;
}
case RTX_64X32: {
uint64_t t = (*(const uint64_t *) &a[0 ] & mask) >> 6 ;
t += (*(const uint64_t *) &a[8 ] & mask) >> 6 ;
t += (*(const uint64_t *) &l[0 ] & mask) >> 6 ;
t *= mul;
s = (int ) (t >> 56 ) - 16 - 8 ;
break ;
}
case RTX_4X16: {
uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
t += *(const uint32_t *) l & (uint32_t) mask;
t = (t >> 6 ) * (uint32_t) mul;
s = (int ) (t >> 24 ) - 1 - 4 ;
break ;
}
case RTX_16X4: {
uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
t += *(const uint8_t *) l & (uint32_t) mask;
t = (t >> 6 ) * (uint32_t) mul;
s = (int ) (t >> 24 ) - 4 - 1 ;
break ;
}
case RTX_8X32: {
uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
t += *(const uint64_t *) l & mask;
t = (t >> 6 ) * mul;
s = (int ) (t >> 56 ) - 2 - 8 ;
break ;
}
case RTX_32X8: {
uint64_t t = *(const uint64_t *) a & mask;
t += *(const uint16_t *) l & (uint32_t) mask;
t = (t >> 6 ) * mul;
s = (int ) (t >> 56 ) - 8 - 2 ;
break ;
}
case RTX_16X64: {
uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
t += *(const uint64_t *) &l[0 ] & mask;
t = (t >> 6 ) + ((*(const uint64_t *) &l[8 ] & mask) >> 6 );
t *= mul;
s = (int ) (t >> 56 ) - 4 - 16 ;
break ;
}
case RTX_64X16: {
uint64_t t = *(const uint64_t *) &a[0 ] & mask;
t += *(const uint32_t *) l & (uint32_t) mask;
t = (t >> 6 ) + ((*(const uint64_t *) &a[8 ] & mask) >> 6 );
t *= mul;
s = (int ) (t >> 56 ) - 16 - 4 ;
break ;
}
}
/* Map the sign of s to a context: 0 if s == 0, 1 if s < 0, 2 if s > 0. */
return (s != 0 ) + (s > 0 );
}
/*
 * Compute the CDF context for a coefficient's base ("lo") token from the
 * already-decoded levels around it.
 *
 * levels       pointer to this coefficient's entry in the level map
 * tx_class     TX_CLASS_2D or one of the 1-D classes
 * hi_mag       out: sum of the three nearest neighbours, reused by the
 *              caller to derive the high-token context
 * ctx_offsets  per-position base offsets; only read for TX_CLASS_2D
 * x, y         position of the coefficient
 * stride       distance between rows of the level map
 *
 * For the 2-D class the neighbours are (0,1) (1,0) (1,1) (0,2) (2,0) as
 * (row, column) offsets; for 1-D classes they are the next four entries in
 * the same row. The total of all five is rounded to steps of 128 and capped
 * at 4, then added to the base offset.
 */
static inline unsigned get_lo_ctx(const uint8_t *const levels,
                                  const enum TxClass tx_class,
                                  unsigned *const hi_mag,
                                  const uint8_t (*const ctx_offsets)[5],
                                  const unsigned x, const unsigned y,
                                  const ptrdiff_t stride)
{
    const int is_2d = tx_class == TX_CLASS_2D;

    /* Two neighbours are shared by both classes; the third differs. */
    unsigned near_sum = levels[0 * stride + 1] + levels[1 * stride + 0];
    near_sum += is_2d ? levels[1 * stride + 1] : levels[0 * stride + 2];
    *hi_mag = near_sum;

    unsigned total = near_sum;
    unsigned base;
    if (is_2d) {
        total += levels[0 * stride + 2] + levels[2 * stride + 0];
        base = ctx_offsets[umin(y, 4)][umin(x, 4)];
    } else {
        total += levels[0 * stride + 3] + levels[0 * stride + 4];
        base = 26 + (y > 1 ? 10 : y * 5);
    }

    const unsigned mag_ctx = total > 512 ? 4 : (total + 64) >> 7;
    return base + mag_ctx;
}
/*
 * Entropy-decode and dequantize the coefficients of one transform block.
 *
 * t        task context; symbols are read from t->ts->msac using the CDFs in
 *          t->ts->cdf, and t->scratch.levels is used as a temporary level map
 * a, l     above / left coefficient-context bytes for this block position
 * tx, bs   transform size and block size
 * b        block being decoded (seg_id and prediction modes are read)
 * intra    non-zero for intra blocks
 * plane    0 for luma, non-zero for chroma
 * cf       coefficient buffer. While tokens are parsed, each non-zero AC
 *          entry temporarily holds (tok << 11) | index of the next non-zero
 *          coefficient; the sign/residual pass then walks that chain and
 *          overwrites every entry with the signed, dequantized value.
 * txtp     in: luma transform type (only read for inter chroma blocks);
 *          out: transform type chosen for this block
 * res_ctx  out: context byte for neighbouring blocks, min(cumulative level,
 *          63) in bits 0-5 and a DC sign class in bits 6-7 (0x00 when the
 *          decoded dc_sign is 1, 0x40 when DC is zero, 0x80 when dc_sign
 *          is 0)
 *
 * Returns -1 when the block is signalled as all-zero (cf is not touched),
 * otherwise the decoded eob value (0 means only the DC coefficient is coded).
 */
static int decode_coefs(Dav1dTaskContext *const t,
uint8_t *const a, uint8_t *const l,
const enum RectTxfmSize tx, const enum BlockSize bs,
const Av1Block *const b, const int intra,
const int plane, coef *cf,
enum TxfmType *const txtp, uint8_t *res_ctx)
{
Dav1dTileState *const ts = t->ts;
const int chroma = !!plane;
const Dav1dFrameContext *const f = t->f;
const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
// The trailing "&& 0" forces dbg to 0, so every "if (dbg) printf" trace in
// this function is disabled unless this line is edited.
const int dbg = DEBUG_BLOCK_INFO && plane && 0 ;
if (dbg)
printf("Start: r=%d\n" , ts->msac.rng);
// does this block have any non-zero coefficients
const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.coef.skip[t_dim->ctx][sctx]);
if (dbg)
printf("Post-non-zero[%d][%d][%d]: r=%d\n" ,
t_dim->ctx, sctx, all_skip, ts->msac.rng);
if (all_skip) {
// 0x40 is the same value dc_sign_level gets below when the DC token is zero
*res_ctx = 0 x40;
*txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
return -1 ;
}
// transform type (chroma: derived, luma: explicitly coded)
if (lossless) {
assert(t_dim->max == TX_4X4);
*txtp = WHT_WHT;
} else if (t_dim->max + intra >= TX_64X64) {
*txtp = DCT_DCT;
} else if (chroma) {
// inferred from either the luma txtp (inter) or a LUT (intra)
*txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
get_uv_inter_txtp(t_dim, *txtp);
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
// In libaom, lossless is checked by a literal qidx == 0, but not all
// such blocks are actually lossless. The remainder gets an implicit
// transform type (for luma)
*txtp = DCT_DCT;
} else {
// explicitly coded luma transform type; the symbol indexes a slice of
// dav1d_tx_types_per_set[] chosen by the set in use
unsigned idx;
if (intra) {
const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4 );
*txtp = dav1d_tx_types_per_set[idx + 0 ];
} else {
idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6 );
*txtp = dav1d_tx_types_per_set[idx + 5 ];
}
if (dbg)
printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n" ,
tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
} else {
if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
idx = dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.txtp_inter3[t_dim->min]);
*txtp = (idx - 1 ) & IDTX; /* idx ? DCT_DCT : IDTX */
} else if (t_dim->min == TX_16X16) {
idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.txtp_inter2, 11 );
*txtp = dav1d_tx_types_per_set[idx + 12 ];
} else {
idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.txtp_inter1[t_dim->min], 15 );
*txtp = dav1d_tx_types_per_set[idx + 24 ];
}
if (dbg)
printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n" ,
tx, t_dim->min, idx, *txtp, ts->msac.rng);
}
}
// find end-of-block (eob)
int eob_bin;
const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
const int tx2dszctx = slw + slh;
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
const int is_1d = tx_class != TX_CLASS_2D;
// One case per tx2dszctx value 0..6, each with its own eob_bin CDF. The two
// largest sizes pass an empty is_1d argument, i.e. their CDF is not indexed
// by is_1d.
switch (tx2dszctx) {
#define case_sz(sz, bin, ns, is_1d) \
case sz: { \
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_## bin[chroma]is_1d; \
eob_bin = dav1d_msac_decode_symbol_adapt## ns(&ts->msac, eob_bin_cdf, 4 + sz); \
break ; \
}
case_sz(0 , 16 , 8 , [is_1d]);
case_sz(1 , 32 , 8 , [is_1d]);
case_sz(2 , 64 , 8 , [is_1d]);
case_sz(3 , 128 , 8 , [is_1d]);
case_sz(4 , 256 , 16 , [is_1d]);
case_sz(5 , 512 , 16 , );
case_sz(6 , 1024 , 16 , );
#undef case_sz
}
if (dbg)
printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n" ,
16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
int eob;
if (eob_bin > 1 ) {
// eob = implicit leading 1, then one adaptively coded bit, then
// (eob_bin - 2) bits read with dav1d_msac_decode_bools()
uint16_t *const eob_hi_bit_cdf =
ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
if (dbg)
printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n" ,
t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
eob = ((eob_hi_bit | 2 ) << (eob_bin - 2 )) |
dav1d_msac_decode_bools(&ts->msac, eob_bin - 2 );
if (dbg)
printf("Post-eob[%d]: r=%d\n" , eob, ts->msac.rng);
} else {
eob = eob_bin;
}
assert(eob >= 0 );
// base tokens
uint16_t (*const eob_cdf)[4 ] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
uint16_t (*const hi_cdf)[4 ] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3 )][chroma];
unsigned rc, dc_tok;
if (eob) {
uint16_t (*const lo_cdf)[4 ] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
/* eob */
unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2 );
int tok = eob_tok + 1 ;
int level_tok = tok * 0 x41;
unsigned mag;
/*
 * DECODE_COEFS_CLASS(tx_class) is expanded once per transform class inside
 * the switch below. Its parameter shadows the local variable tx_class and is
 * always a constant at the expansion site. It decodes the token at the eob
 * position, then walks the remaining AC positions from eob - 1 down to 1,
 * then decodes the DC token, and ends in "break". It relies on scan, stride,
 * shift, shift2, mask and lo_ctx_offsets being defined by the enclosing case.
 * While walking, rc always holds the index of the most recently decoded
 * non-zero coefficient, which is stored alongside the token in cf[] to chain
 * the non-zero entries together.
 *
 * NOTE(review): the two "if (TX_CLASS_2D)" tests inside the macro check the
 * enum constant itself rather than the macro argument. If that constant is 0
 * the "levels + rc" branches are never taken and "levels + x * stride + y"
 * is always used. Do not "fix" this by comparing against tx_class without
 * also reviewing the loop copy: there rc is the previous non-zero index, not
 * the current position (rc_i). Confirm intent before touching.
 */
#define DECODE_COEFS_CLASS(tx_class) \
unsigned x, y; \
uint8_t *level; \
if (tx_class == TX_CLASS_2D) \
rc = scan[eob], x = rc >> shift, y = rc & mask; \
else if (tx_class == TX_CLASS_H) \
/* Transposing reduces the stride and padding requirements */ \
x = eob & mask, y = eob >> shift, rc = eob; \
else /* tx_class == TX_CLASS_V */ \
x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
if (dbg) \
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n" , \
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
if (eob_tok == 2 ) { \
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0 ) ? 14 : 7 ; \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6 ); \
if (dbg) \
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n" , \
imin(t_dim->ctx, 3 ), chroma, ctx, eob, rc, tok, \
ts->msac.rng); \
} \
cf[rc] = tok << 11 ; \
if (TX_CLASS_2D) \
level = levels + rc; \
else \
level = levels + x * stride + y; \
*level = (uint8_t) level_tok; \
for (int i = eob - 1 ; i > 0 ; i--) { /* ac */ \
unsigned rc_i; \
if (tx_class == TX_CLASS_2D) \
rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
else if (tx_class == TX_CLASS_H) \
x = i & mask, y = i >> shift, rc_i = i; \
else /* tx_class == TX_CLASS_V */ \
x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
assert(x < 32 && y < 32 ); \
if (TX_CLASS_2D) \
level = levels + rc; \
else \
level = levels + x * stride + y; \
ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
if (tx_class == TX_CLASS_2D) \
y |= x; \
tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3 ); \
if (dbg) \
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n" , \
t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
if (tok == 3 ) { \
mag &= 63 ; \
ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7 ) + \
(mag > 12 ? 6 : (mag + 1 ) >> 1 ); \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
if (dbg) \
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n" , \
imin(t_dim->ctx, 3 ), chroma, ctx, i, rc_i, tok, \
ts->msac.rng); \
*level = (uint8_t) (tok + (3 << 6 )); \
cf[rc_i] = (tok << 11 ) | rc; \
rc = rc_i; \
} else { \
/* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
tok *= 0 x17ff41; \
*level = (uint8_t) tok; \
/* tok ? (tok << 11) | rc : 0 */ \
tok = (tok >> 9 ) & (rc + ~0 x7ffu); \
if (tok) rc = rc_i; \
cf[rc_i] = tok; \
} \
} \
/* dc */ \
ctx = (tx_class == TX_CLASS_2D) ? 0 : \
get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0 , 0 , stride); \
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3 ); \
if (dbg) \
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n" , \
t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
if (dc_tok == 3 ) { \
if (tx_class == TX_CLASS_2D) \
mag = levels[0 * stride + 1 ] + levels[1 * stride + 0 ] + \
levels[1 * stride + 1 ]; \
mag &= 63 ; \
ctx = mag > 12 ? 6 : (mag + 1 ) >> 1 ; \
dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
if (dbg) \
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n" , \
imin(t_dim->ctx, 3 ), chroma, dc_tok, ts->msac.rng); \
} \
break
const uint16_t *scan;
// Each case sets up the level-map geometry for its class, clears the part of
// the level map it will use (plus two extra rows of padding) and expands the
// macro, whose final "break" leaves the switch.
switch (tx_class) {
case TX_CLASS_2D: {
const unsigned nonsquare_tx = tx >= RTX_4X8;
const uint8_t (*const lo_ctx_offsets)[5 ] =
dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
scan = dav1d_scans[tx];
const ptrdiff_t stride = 4 << slh;
const unsigned shift = slh + 2 , shift2 = 0 ;
const unsigned mask = (4 << slh) - 1 ;
memset(levels, 0 , stride * ((4 << slw) + 2 ));
DECODE_COEFS_CLASS(TX_CLASS_2D);
}
case TX_CLASS_H: {
const uint8_t (*const lo_ctx_offsets)[5 ] = NULL;
const ptrdiff_t stride = 16 ;
const unsigned shift = slh + 2 , shift2 = 0 ;
const unsigned mask = (4 << slh) - 1 ;
memset(levels, 0 , stride * ((4 << slh) + 2 ));
DECODE_COEFS_CLASS(TX_CLASS_H);
}
case TX_CLASS_V: {
const uint8_t (*const lo_ctx_offsets)[5 ] = NULL;
const ptrdiff_t stride = 16 ;
const unsigned shift = slw + 2 , shift2 = slh + 2 ;
const unsigned mask = (4 << slw) - 1 ;
memset(levels, 0 , stride * ((4 << slw) + 2 ));
DECODE_COEFS_CLASS(TX_CLASS_V);
}
#undef DECODE_COEFS_CLASS
default : assert(0 );
}
} else { // dc-only
int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0 ], 2 );
dc_tok = 1 + tok_br;
if (dbg)
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n" ,
t_dim->ctx, chroma, 0 , dc_tok, ts->msac.rng);
if (tok_br == 2 ) {
dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0 ]);
if (dbg)
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n" ,
imin(t_dim->ctx, 3 ), chroma, dc_tok, ts->msac.rng);
}
// no AC coefficients: the AC chain is empty
rc = 0 ;
}
// residual and sign
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
// the quantizer matrix is only applied for transform types below IDTX
const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
const int dq_shift = imax(0 , t_dim->ctx - 2 );
const int cf_max = ~(~127 U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
unsigned cul_level, dc_sign_level;
if (!dc_tok) {
// DC is zero: skip DC sign/dequant and jump straight into the AC loop of
// the matching branch.
// NOTE(review): the jump bypasses the "if (rc)" guard in front of the
// label, so this relies on rc being a non-zero index whenever dc_tok is 0
// -- confirm that the eob path above guarantees it.
cul_level = 0 ;
dc_sign_level = 1 << 6 ;
if (qm_tbl) goto ac_qm;
goto ac_noqm;
}
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
if (dbg)
printf("Post-dc_sign[%d][%d][%d]: r=%d\n" ,
chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
int dc_dq = dq_tbl[0 ];
// dc_sign ? 0 : 2 << 6
dc_sign_level = (dc_sign - 1 ) & (2 << 6 );
if (qm_tbl) {
dc_dq = (dc_dq * qm_tbl[0 ] + 16 ) >> 5 ;
if (dc_tok == 15 ) {
// token 15 is an escape: the remainder is Golomb coded
dc_tok = read_golomb(&ts->msac) + 15 ;
if (dbg)
printf("Post-dc_residual[%d->%d]: r=%d\n" ,
dc_tok - 15 , dc_tok, ts->msac.rng);
dc_tok &= 0 xfffff;
dc_dq = (dc_dq * dc_tok) & 0 xffffff;
} else {
dc_dq *= dc_tok;
assert(dc_dq <= 0 xffffff);
}
cul_level = dc_tok;
dc_dq >>= dq_shift;
// negative values may reach cf_max + 1 in magnitude
dc_dq = umin(dc_dq, cf_max + dc_sign);
cf[0 ] = (coef) (dc_sign ? -dc_dq : dc_dq);
if (rc) ac_qm: {
const unsigned ac_dq = dq_tbl[1 ];
// walk the chain of non-zero AC coefficients built during token parsing
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n" , rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16 ) >> 5 ;
int dq_sat;
if (rc_tok >= (15 << 11 )) {
tok = read_golomb(&ts->msac) + 15 ;
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n" ,
rc, tok - 15 , tok, ts->msac.rng);
tok &= 0 xfffff;
dq = (dq * tok) & 0 xffffff;
} else {
tok = rc_tok >> 11 ;
dq *= tok;
assert(dq <= 0 xffffff);
}
cul_level += tok;
dq >>= dq_shift;
dq_sat = umin(dq, cf_max + sign);
cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
rc = rc_tok & 0 x3ff;
} while (rc);
}
} else {
// non-qmatrix is the common case and allows for additional optimizations
if (dc_tok == 15 ) {
dc_tok = read_golomb(&ts->msac) + 15 ;
if (dbg)
printf("Post-dc_residual[%d->%d]: r=%d\n" ,
dc_tok - 15 , dc_tok, ts->msac.rng);
dc_tok &= 0 xfffff;
dc_dq = ((dc_dq * dc_tok) & 0 xffffff) >> dq_shift;
dc_dq = umin(dc_dq, cf_max + dc_sign);
} else {
dc_dq = ((dc_dq * dc_tok) >> dq_shift);
assert(dc_dq <= cf_max);
}
cul_level = dc_tok;
cf[0 ] = (coef) (dc_sign ? -dc_dq : dc_dq);
if (rc) ac_noqm: {
const unsigned ac_dq = dq_tbl[1 ];
do {
const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n" , rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok;
int dq;
// residual
if (rc_tok >= (15 << 11 )) {
tok = read_golomb(&ts->msac) + 15 ;
if (dbg)
printf("Post-residual[%d=%d->%d]: r=%d\n" ,
rc, tok - 15 , tok, ts->msac.rng);
// coefficient parsing, see 5.11.39
tok &= 0 xfffff;
// dequant, see 7.12.3
dq = ((ac_dq * tok) & 0 xffffff) >> dq_shift;
dq = umin(dq, cf_max + sign);
} else {
// cannot exceed cf_max, so we can avoid the clipping
tok = rc_tok >> 11 ;
dq = ((ac_dq * tok) >> dq_shift);
assert(dq <= cf_max);
}
cul_level += tok;
cf[rc] = (coef) (sign ? -dq : dq);
rc = rc_tok & 0 x3ff; // next non-zero rc, zero if eob
} while (rc);
}
}
// context
*res_ctx = umin(cul_level, 63 ) | dc_sign_level;
return eob;
}
/*
 * Recursively walk the luma transform split tree of a block and process each
 * leaf transform: decode its coefficients and/or apply the inverse transform,
 * depending on t->frame_thread.pass.
 *
 * t          task context; t->bx / t->by are advanced while visiting the
 *            sub-transforms and restored before returning
 * bs, b      block size and block being processed
 * ytx        luma transform size at this tree level
 * depth      current split depth (splits are only considered for depth < 2)
 * tx_split   per-depth split bitmasks, indexed by y_off * 4 + x_off
 * x_off,
 * y_off      position of this node within the split bitmask
 * dst        destination pixels for reconstruction; may be NULL when no
 *            reconstruction happens in this pass
 *
 * Leaf behaviour by pass, as visible below:
 *   pass != 2     coefficients are decoded with decode_coefs() (intra = 0,
 *                 plane = 0), the above/left contexts and txtp_map are updated
 *   pass == 1     eob/txtp are additionally appended to frame_thread[1].cbi
 *   pass == 2     eob/txtp are read back from frame_thread[0].cbi instead
 *   pass 0 or 2   the inverse transform is added to dst when eob >= 0
 */
static void read_coef_tree(Dav1dTaskContext *const t,
const enum BlockSize bs, const Av1Block *const b,
const enum RectTxfmSize ytx, const int depth,
const uint16_t *const tx_split,
const int x_off, const int y_off, pixel *dst)
{
const Dav1dFrameContext *const f = t->f;
Dav1dTileState *const ts = t->ts;
const Dav1dDSPContext *const dsp = f->dsp;
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
const int txw = t_dim->w, txh = t_dim->h;
/* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
* be split. Checking tx_split[depth] for non-zero first avoids an
* undefined left shift. */
if (depth < 2 && tx_split[depth] &&
tx_split[depth] & (1 << (y_off * 4 + x_off)))
{
const enum RectTxfmSize sub = t_dim->sub;
const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
/* Top-left child is always visited. The right child only exists when the
* transform is at least as wide as tall, the bottom children only when it
* is at least as tall as wide; children starting outside the frame
* (t->bx >= f->bw or t->by >= f->bh) are skipped. */
read_coef_tree(t, bs, b, sub, depth + 1 , tx_split,
x_off * 2 + 0 , y_off * 2 + 0 , dst);
t->bx += txsw;
if (txw >= txh && t->bx < f->bw)
read_coef_tree(t, bs, b, sub, depth + 1 , tx_split, x_off * 2 + 1 ,
y_off * 2 + 0 , dst ? &dst[4 * txsw] : NULL);
t->bx -= txsw;
t->by += txsh;
if (txh >= txw && t->by < f->bh) {
if (dst)
dst += 4 * txsh * PXSTRIDE(f->cur.stride[0 ]);
read_coef_tree(t, bs, b, sub, depth + 1 , tx_split,
x_off * 2 + 0 , y_off * 2 + 1 , dst);
t->bx += txsw;
if (txw >= txh && t->bx < f->bw)
read_coef_tree(t, bs, b, sub, depth + 1 , tx_split, x_off * 2 + 1 ,
y_off * 2 + 1 , dst ? &dst[4 * txsw] : NULL);
t->bx -= txsw;
}
t->by -= txsh;
} else {
/* Leaf transform. */
const int bx4 = t->bx & 31 , by4 = t->by & 31 ;
enum TxfmType txtp;
uint8_t cf_ctx;
int eob;
coef *cf;
if (t->frame_thread.pass) {
/* Frame threading: take the next slot of the per-tile coefficient
* buffer for this pass and advance it by the slot size. */
const int p = t->frame_thread.pass & 1 ;
assert(ts->frame_thread[p].cf);
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8 ) * imin(t_dim->h, 8 ) * 16 ;
} else {
cf = bitfn(t->cf);
}
if (t->frame_thread.pass != 2 ) {
eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
ytx, bs, b, 0 , 0 , cf, &txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n" ,
ytx, txtp, eob, ts->msac.rng);
/* Propagate the resulting context byte along the above/left edges,
* clipped to the frame boundary. */
dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
#define set_ctx(rep_macro) \
for (int y = 0 ; y < txh; y++) { \
rep_macro(txtp_map, 0 , txtp); \
txtp_map += 32 ; \
}
/* Record the luma transform type for every position covered by this
* transform (rows are 32 entries apart); dav1d_read_coef_blocks() reads
* this map to pass the luma txtp in for inter chroma blocks. */
uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
case_set_upto16(t_dim->lw);
#undef set_ctx
if (t->frame_thread.pass == 1 )
*ts->frame_thread[1 ].cbi++ = eob * (1 << 5 ) + txtp;
} else {
/* Pass 2: unpack what pass 1 stored as eob * 32 + txtp. */
const int cbi = *ts->frame_thread[0 ].cbi++;
eob = cbi >> 5 ;
txtp = cbi & 0 x1f;
}
if (!(t->frame_thread.pass & 1 )) {
assert(dst);
/* eob < 0 means the transform block was signalled as all-zero. */
if (eob >= 0 ) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
coef_dump(cf, imin(t_dim->h, 8 ) * 4 , imin(t_dim->w, 8 ) * 4 , 3 , "dq" );
dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0 ], cf, eob
HIGHBD_CALL_SUFFIX);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(dst, f->cur.stride[0 ], t_dim->w * 4 , t_dim->h * 4 , "recon" );
}
}
}
}
/*
 * Parse all luma and chroma coefficients of block b without reconstructing
 * any pixels. Asserts that it runs in frame-threading pass 1 for non-skip
 * blocks.
 *
 * For skipped blocks only the above/left coefficient contexts are reset to
 * 0x40 and the function returns. Otherwise the block is processed in tiles of
 * 16x16 units (presumably units of 4 pixels -- confirm): luma first, then
 * both chroma planes if the block carries chroma. Decoded coefficients are
 * written to ts->frame_thread[1].cf and one eob * 32 + txtp entry per
 * transform block is appended to ts->frame_thread[1].cbi.
 *
 * t->bx / t->by are temporarily advanced inside the loops and restored to
 * their entry values afterwards.
 */
void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
const enum BlockSize bs, const Av1Block *const b)
{
const Dav1dFrameContext *const f = t->f;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int bx4 = t->bx & 31 , by4 = t->by & 31 ;
const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bw4 = b_dim[0 ], bh4 = b_dim[1 ];
const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
/* Chroma is present unless the layout is I400, or the block is a single
* unit wide/high in a subsampled direction and sits at an even position. */
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
(bw4 > ss_hor || t->bx & 1 ) &&
(bh4 > ss_ver || t->by & 1 );
if (b->skip) {
/* No coefficients coded: only reset the neighbour contexts. */
BlockContext *const a = t->a;
dav1d_memset_pow2[b_dim[2 ]](&a->lcoef[bx4], 0 x40);
dav1d_memset_pow2[b_dim[3 ]](&t->l.lcoef[by4], 0 x40);
if (has_chroma) {
dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
memset_cw(&a->ccoef[0 ][cbx4], 0 x40);
memset_cw(&a->ccoef[1 ][cbx4], 0 x40);
memset_ch(&t->l.ccoef[0 ][cby4], 0 x40);
memset_ch(&t->l.ccoef[1 ][cby4], 0 x40);
}
return ;
}
Dav1dTileState *const ts = t->ts;
/* Block dimensions clipped to the frame boundary. */
const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
assert(t->frame_thread.pass == 1 );
assert(!b->skip);
const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
const uint16_t tx_split[2 ] = { b->tx_split0, b->tx_split1 };
for (int init_y = 0 ; init_y < h4; init_y += 16 ) {
const int sub_h4 = imin(h4, 16 + init_y);
for (int init_x = 0 ; init_x < w4; init_x += 16 ) {
const int sub_w4 = imin(w4, init_x + 16 );
int y_off = !!init_y, y, x;
/* Luma: step through the tile one (maximum-size) transform at a time.
* The loop headers advance t->bx / t->by in lockstep with x / y; the
* statements after each loop undo the accumulated offset. */
for (y = init_y, t->by += init_y; y < sub_h4;
y += t_dim->h, t->by += t_dim->h, y_off++)
{
int x_off = !!init_x;
for (x = init_x, t->bx += init_x; x < sub_w4;
x += t_dim->w, t->bx += t_dim->w, x_off++)
{
if (!b->intra) {
/* Inter: the transform may be split further; no destination
* pixels are passed since nothing is reconstructed here. */
read_coef_tree(t, bs, b, b->max_ytx, 0 , tx_split,
x_off, y_off, NULL);
} else {
uint8_t cf_ctx = 0 x40;
enum TxfmType txtp;
const int eob =
decode_coefs(t, &t->a->lcoef[bx4 + x],
&t->l.lcoef[by4 + y], b->tx, bs, b, 1 ,
0 , ts->frame_thread[1 ].cf, &txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n" ,
b->tx, txtp, eob, ts->msac.rng);
*ts->frame_thread[1 ].cbi++ = eob * (1 << 5 ) + txtp;
ts->frame_thread[1 ].cf += imin(t_dim->w, 8 ) * imin(t_dim->h, 8 ) * 16 ;
dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
}
}
t->bx -= x;
}
t->by -= y;
if (!has_chroma) continue ;
/* Chroma: x / y count in subsampled units while t->bx / t->by keep
* counting in luma units, hence the shifts by ss_hor / ss_ver. */
const int sub_ch4 = imin(ch4, (init_y + 16 ) >> ss_ver);
const int sub_cw4 = imin(cw4, (init_x + 16 ) >> ss_hor);
for (int pl = 0 ; pl < 2 ; pl++) {
for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
{
for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
{
uint8_t cf_ctx = 0 x40;
enum TxfmType txtp;
/* For inter blocks decode_coefs() derives the chroma transform
* type from the co-located luma one, so seed it from txtp_map.
* For intra blocks txtp is purely an output. */
if (!b->intra)
txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
bx4 + (x << ss_hor)];
const int eob =
decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
&t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
b, b->intra, 1 + pl, ts->frame_thread[1 ].cf,
&txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n" ,
pl, b->uvtx, txtp, eob, ts->msac.rng);
*ts->frame_thread[1 ].cbi++ = eob * (1 << 5 ) + txtp;
ts->frame_thread[1 ].cf += uv_t_dim->w * uv_t_dim->h * 16 ;
/* Context extents clipped to the (subsampled) frame boundary. */
int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
}
t->bx -= x << ss_hor;
}
t->by -= y << ss_ver;
}
}
}
}
/*
 * Motion-compensate one block from a reference picture.
 *
 * Exactly one of dst8 (pixel output) and dst16 (intermediate output) must be
 * non-NULL; dst_stride is only used with dst8.
 *
 * bw4, bh4   block size; multiplied by 4 >> subsampling to get the size
 *            passed to the DSP functions
 * bx, by     block position, in the same units as bw4/bh4
 * pl         plane index (0 = luma; non-zero enables chroma subsampling)
 * mv         motion vector
 * refp       reference picture
 * refidx     index into f->svc[] for the scaled-reference parameters
 * filter_2d  index selecting the interpolation filter function
 *
 * When the reference has the same dimensions as the current frame the plain
 * mc/mct functions are used; otherwise the position is scaled and the
 * mc_scaled/mct_scaled functions are used. In both paths, if the area needed
 * by the filter reaches outside the reference plane, the source is first
 * copied into t->scratch.emu_edge via emu_edge(). Always returns 0.
 */
static int mc(Dav1dTaskContext *const t,
              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
              const int bw4, const int bh4,
              const int bx, const int by, const int pl,
              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
              const enum Filter2d filter_2d)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));

    const Dav1dFrameContext *const f = t->f;
    const int is_chroma = !!pl;
    const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor;
    const int v_mul = 4 >> ss_ver;
    const int blk_w = bw4 * h_mul;
    const int blk_h = bh4 * v_mul;
    const int mvx = mv.x;
    const int mvy = mv.y;
    ptrdiff_t ref_stride = refp->p.stride[is_chroma];
    const pixel *ref;

    const int same_size = refp->p.p.w == f->cur.p.w &&
                          refp->p.p.h == f->cur.p.h;

    if (!same_size) {
        /* Scaled reference. */
        assert(refp != &f->sr_cur);

        const int orig_pos_y = ((by * v_mul) << 4) + mvy * (1 << !ss_ver);
        const int orig_pos_x = ((bx * h_mul) << 4) + mvx * (1 << !ss_hor);
#define scale_mv(res, val, scale) do { \
            const int64_t scaled = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
            res = apply_sign64((int) ((llabs(scaled) + 128) >> 8), scaled) + 32; \
        } while (0)
        int pos_y, pos_x;
        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
#undef scale_mv

        /* Integer bounds (10 fractional bits) of the source area. */
        const int left = pos_x >> 10;
        const int top = pos_y >> 10;
        const int right =
            ((pos_x + (blk_w - 1) * f->svc[refidx][0].step) >> 10) + 1;
        const int bottom =
            ((pos_y + (blk_h - 1) * f->svc[refidx][1].step) >> 10) + 1;

        if (DEBUG_BLOCK_INFO)
            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
                   right-left, bottom-top,
                   f->svc[refidx][0].step, f->svc[refidx][1].step);

        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
        const int outside = left < 3 || top < 3 || right + 4 > w || bottom + 4 > h;

        if (outside) {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
                                w, h, left - 3, top - 3,
                                emu_edge_buf, 320 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[320 * 3 + 3];
            ref_stride = 320 * sizeof(pixel);
            if (DEBUG_BLOCK_INFO) printf("Emu\n");
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
        }

        if (dst8 == NULL) {
            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
                                             blk_w, blk_h,
                                             pos_x & 0x3ff, pos_y & 0x3ff,
                                             f->svc[refidx][0].step,
                                             f->svc[refidx][1].step
                                             HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
                                            blk_w, blk_h,
                                            pos_x & 0x3ff, pos_y & 0x3ff,
                                            f->svc[refidx][0].step,
                                            f->svc[refidx][1].step
                                            HIGHBD_CALL_SUFFIX);
        }
        return 0;
    }

    /* Unscaled reference. */
    const int mx = mvx & (15 >> !ss_hor);
    const int my = mvy & (15 >> !ss_ver);
    /* 1 when the vector has a fractional part in that direction, in which
     * case the filter needs 3 extra pixels before and 4 after the block. */
    const int frac_x = !!mx;
    const int frac_y = !!my;
    const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
    const int dy = by * v_mul + (mvy >> (3 + ss_ver));

    int w, h;
    if (refp->p.data[0] == f->cur.data[0]) {
        /* Reference is the current frame itself (intrabc). */
        w = f->bw * 4 >> ss_hor;
        h = f->bh * 4 >> ss_ver;
    } else {
        w = (f->cur.p.w + ss_hor) >> ss_hor;
        h = (f->cur.p.h + ss_ver) >> ss_ver;
    }

    const int outside = dx < frac_x * 3 || dy < frac_y * 3 ||
                        dx + blk_w + frac_x * 4 > w ||
                        dy + blk_h + frac_y * 4 > h;

    if (outside) {
        pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
        f->dsp->mc.emu_edge(blk_w + frac_x * 7, blk_h + frac_y * 7,
                            w, h, dx - frac_x * 3, dy - frac_y * 3,
                            emu_edge_buf, 192 * sizeof(pixel),
                            refp->p.data[pl], ref_stride);
        ref = &emu_edge_buf[192 * frac_y * 3 + frac_x * 3];
        ref_stride = 192 * sizeof(pixel);
    } else {
        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
    }

    if (dst8 == NULL) {
        f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, blk_w,
                                  blk_h, mx << !ss_hor, my << !ss_ver
                                  HIGHBD_CALL_SUFFIX);
    } else {
        f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, blk_w,
                                 blk_h, mx << !ss_hor, my << !ss_ver
                                 HIGHBD_CALL_SUFFIX);
    }
    return 0;
}
/*
 * Overlapped block motion compensation for plane `pl` of the block located
 * at t->bx / t->by.
 *
 * The neighbours along the top edge are visited when the block is not in the
 * first row of its tile (and, for chroma, when the block is large enough);
 * the neighbours along the left edge are visited when the block is not in the
 * first column of its tile. For each neighbour that has a reference frame
 * (ref.ref[0] > 0), a prediction using the neighbour's motion vector is
 * rendered into t->scratch.lap via mc() and blended into dst with blend_h
 * (top) or blend_v (left). At most imin(b_dim[2], 4) top neighbours and
 * imin(b_dim[3], 4) left neighbours are blended.
 *
 * Returns 0 on success, or the first non-zero value returned by mc().
 */
static int obmc(Dav1dTaskContext *const t,
                pixel *const dst, const ptrdiff_t dst_stride,
                const uint8_t *const b_dim, const int pl,
                const int bx4, const int by4, const int w4, const int h4)
{
    assert(!(t->bx & 1) && !(t->by & 1));
    const Dav1dFrameContext *const f = t->f;
    /*const*/ refmvs_block **rows = &t->rt.r[(t->by & 31) + 5];
    pixel *const overlap = bitfn(t->scratch.lap);
    const int is_chroma = !!pl;
    const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor;
    const int v_mul = 4 >> ss_ver;

    /* Top neighbours. */
    const int has_top = t->by > t->ts->tiling.row_start;
    if (has_top && (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16)) {
        const int max_blends = imin(b_dim[2], 4);
        int blended = 0;
        int x = 0;
        while (x < w4 && blended < max_blends) {
            // only odd blocks are considered for overlap handling, hence +1
            const int col = x + 1;
            const refmvs_block *const above = &rows[-1][t->bx + col];
            const int step4 = iclip(dav1d_block_dimensions[above->bs][0], 2, 16);
            const int ref = above->ref.ref[0];

            if (ref > 0) {
                const int ow4 = imin(step4, b_dim[0]);
                const int oh4 = imin(b_dim[1], 16) >> 1;
                const enum Filter2d filt =
                    dav1d_filter_2d[t->a->filter[1][bx4 + col]]
                                   [t->a->filter[0][bx4 + col]];
                const int err = mc(t, overlap, NULL,
                                   ow4 * h_mul * sizeof(pixel),
                                   ow4, (oh4 * 3 + 3) >> 2,
                                   t->bx + x, t->by, pl, above->mv.mv[0],
                                   &f->refp[ref - 1], ref - 1, filt);
                if (err) return err;
                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, overlap,
                                   h_mul * ow4, v_mul * oh4);
                blended++;
            }
            x += step4;
        }
    }

    /* Left neighbours. */
    if (t->bx > t->ts->tiling.col_start) {
        const int max_blends = imin(b_dim[3], 4);
        int blended = 0;
        int y = 0;
        while (y < h4 && blended < max_blends) {
            // only odd blocks are considered for overlap handling, hence +1
            const int row = y + 1;
            const refmvs_block *const left = &rows[row][t->bx - 1];
            const int step4 = iclip(dav1d_block_dimensions[left->bs][1], 2, 16);
            const int ref = left->ref.ref[0];

            if (ref > 0) {
                const int ow4 = imin(b_dim[0], 16) >> 1;
                const int oh4 = imin(step4, b_dim[1]);
                const enum Filter2d filt =
                    dav1d_filter_2d[t->l.filter[1][by4 + row]]
                                   [t->l.filter[0][by4 + row]];
                const int err = mc(t, overlap, NULL,
                                   h_mul * ow4 * sizeof(pixel), ow4, oh4,
                                   t->bx, t->by + y, pl, left->mv.mv[0],
                                   &f->refp[ref - 1], ref - 1, filt);
                if (err) return err;
                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
                                   dst_stride, overlap,
                                   h_mul * ow4, v_mul * oh4);
                blended++;
            }
            y += step4;
        }
    }

    return 0;
}
/*
 * Warped (affine) prediction of plane `pl` for the block at t->bx / t->by.
 *
 * The block is processed in 8x8 pixel tiles. For each tile the source
 * position is derived from wmp->matrix relative to the tile centre (in luma
 * pixel units); when the 15x15 source window around it is not fully inside
 * the reference plane, the window is first copied into t->scratch.emu_edge
 * through emu_edge().
 *
 * Exactly one of dst8 / dst16 must be non-NULL: dst8 is written through
 * warp8x8, dst16 through warp8x8t. `dstride` is forwarded unchanged to the
 * DSP function; rows are advanced by 8 * PXSTRIDE(dstride) for dst8 and by
 * 8 * dstride for dst16.
 *
 * Always returns 0.
 */
static int warp_affine(Dav1dTaskContext *const t,
                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
                       const uint8_t *const b_dim, const int pl,
                       const Dav1dThreadPicture *const refp,
                       const Dav1dWarpedMotionParams *const wmp)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    const int blk_w = b_dim[0] * h_mul;
    const int blk_h = b_dim[1] * v_mul;
    assert(!(blk_w & 7) && !(blk_h & 7));
    const int32_t *const mat = wmp->matrix;
    const int ref_w = (refp->p.p.w + ss_hor) >> ss_hor;
    const int ref_h = (refp->p.p.h + ss_ver) >> ss_ver;
    const ptrdiff_t pic_stride = refp->p.stride[!!pl];

    for (int y = 0; y < blk_h; y += 8) {
        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];

        for (int x = 0; x < blk_w; x += 8) {
            // calculate transformation relative to center of 8x8 block in
            // luma pixel units
            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;

            /* integer position of the tile and filter phase per axis */
            const int dx = (int) (mvx >> 16) - 4;
            const int dy = (int) (mvy >> 16) - 4;
            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
                                                   wmp->u.p.beta  * 7) & ~0x3f;
            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
                                                   wmp->u.p.delta * 4) & ~0x3f;

            const int needs_emu = dx < 3 || dx + 8 + 4 > ref_w ||
                                  dy < 3 || dy + 8 + 4 > ref_h;
            const pixel *src;
            ptrdiff_t src_stride;
            if (!needs_emu) {
                src_stride = pic_stride;
                src = ((pixel *) refp->p.data[pl]) +
                      PXSTRIDE(src_stride) * dy + dx;
            } else {
                pixel *const emu = bitfn(t->scratch.emu_edge);

                dsp->mc.emu_edge(15, 15, ref_w, ref_h, dx - 3, dy - 3,
                                 emu, 32 * sizeof(pixel),
                                 refp->p.data[pl], pic_stride);
                src = &emu[32 * 3 + 3];
                src_stride = 32 * sizeof(pixel);
            }

            if (dst16 == NULL) {
                dsp->mc.warp8x8(&dst8[x], dstride, src, src_stride,
                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            } else {
                dsp->mc.warp8x8t(&dst16[x], dstride, src, src_stride,
                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            }
        }

        /* advance the active destination by eight rows */
        if (dst8) {
            dst8 += 8 * PXSTRIDE(dstride);
        } else {
            dst16 += 8 * dstride;
        }
    }
    return 0;
}
/*
 * Reconstruct one intra-coded block.
 *
 * The block is walked in sub-areas of at most 16x16 units (one unit is 4
 * pixels). For each sub-area the luma plane is predicted (palette, or
 * dav1d_prepare_intra_edges() followed by intra_pred[]) and, unless b->skip
 * is set, the luma residual is added per transform block. If the block
 * carries chroma, both chroma planes are then handled the same way, with
 * chroma-from-luma (CfL) and chroma palette as additional prediction modes.
 *
 * t                - task context; t->bx / t->by are advanced while
 *                    iterating and restored to their entry values afterwards
 * bs               - block size, used as index into dav1d_block_dimensions
 * intra_edge_flags - top-right / bottom-left availability flags of the block
 * b                - block description (modes, palette sizes, transform
 *                    sizes, skip flag, CfL alphas)
 */
void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
const enum EdgeFlags intra_edge_flags,
const Av1Block *const b)
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const Dav1dDSPContext *const dsp = f->dsp;
// block position within a 32-unit window (low 5 bits of bx / by)
const int bx4 = t->bx & 31 , by4 = t->by & 31 ;
// chroma subsampling flags derived from the pixel layout
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bw4 = b_dim[0 ], bh4 = b_dim[1 ];
// block size in units, clipped to the frame size (f->bw / f->bh)
const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
// chroma is handled here unless the layout is I400, or the block is only
// one unit in a subsampled dimension and sits at an even position
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
(bw4 > ss_hor || t->bx & 1 ) &&
(bh4 > ss_ver || t->by & 1 );
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
// coefficient coding
// edge pointer handed to prepare_intra_edges / intra_pred; it sits 128
// pixels into the scratch buffer
pixel *const edge = bitfn(t->scratch.edge) + 128 ;
const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
// sequence-level intra edge filter flag moved to bit 10; it is OR'ed into
// the angle argument of intra_pred[] below
const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10 ;
// iterate over sub-areas of at most 16x16 units
for (int init_y = 0 ; init_y < h4; init_y += 16 ) {
const int sub_h4 = imin(h4, 16 + init_y);
const int sub_ch4 = imin(ch4, (init_y + 16 ) >> ss_ver);
for (int init_x = 0 ; init_x < w4; init_x += 16 ) {
// luma palette prediction over the full bw4*4 x bh4*4 block
if (b->pal_sz[0 ]) {
pixel *dst = ((pixel *) f->cur.data[0 ]) +
4 * (t->by * PXSTRIDE(f->cur.stride[0 ]) + t->bx);
const uint8_t *pal_idx;
// in a frame-threading pass, read the stored palette indices and
// advance the read pointer; otherwise use the scratch copy
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1 ;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8 ;
} else {
pal_idx = t->scratch.pal_idx_y;
}
// palette entry 0 is the luma palette
const pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1 ) + (t->bx & 1 )) * (f->b4_stride >> 1 ) +
((t->bx >> 1 ) + (t->by & 1 ))][0 ] :
bytefn(t->scratch.pal)[0 ];
f->dsp->ipred.pal_pred(dst, f->cur.stride[0 ], pal,
pal_idx, bw4 * 4 , bh4 * 4 );
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(dst, PXSTRIDE(f->cur.stride[0 ]),
bw4 * 4 , bh4 * 4 , "y-pal-pred" );
}
// flags OR'ed into the angle argument of the luma intra_pred[] call
const int intra_flags = (sm_flag(t->a, bx4) |
sm_flag(&t->l, by4) |
intra_edge_filter_flag);
// top-right / bottom-left availability of this sub-area: known to be
// available when another sub-area follows to the right / below inside
// the block, otherwise taken from intra_edge_flags for the first
// row / column of sub-areas only
const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
int y, x;
const int sub_w4 = imin(w4, init_x + 16 );
// luma transform blocks; t->by / t->bx move together with y / x and are
// restored after each loop
for (y = init_y, t->by += init_y; y < sub_h4;
y += t_dim->h, t->by += t_dim->h)
{
pixel *dst = ((pixel *) f->cur.data[0 ]) +
4 * (t->by * PXSTRIDE(f->cur.stride[0 ]) +
t->bx + init_x);
for (x = init_x, t->bx += init_x; x < sub_w4;
x += t_dim->w, t->bx += t_dim->w)
{
// palette blocks were already predicted above
if (b->pal_sz[0 ]) goto skip_y_pred;
int angle = b->y_angle;
// per-transform-block edge availability derived from the position
// inside the sub-area
const enum EdgeFlags edge_flags =
(((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
0 : EDGE_I444_TOP_HAS_RIGHT) |
((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
0 : EDGE_I444_LEFT_HAS_BOTTOM);
// when t->by is a multiple of f->sb_step, pass a pointer into
// f->ipred_edge[0] for the row above (sby - 1)
const pixel *top_sb_edge = NULL;
if (!(t->by & (f->sb_step - 1 ))) {
top_sb_edge = f->ipred_edge[0 ];
const int sby = t->by >> f->sb_shift;
top_sb_edge += f->sb128w * 128 * (sby - 1 );
}
// presumably fills `edge` with the neighbouring pixels; the return
// value selects the intra_pred[] function and `angle` may be updated
const enum IntraPredMode m =
bytefn(dav1d_prepare_intra_edges)(t->bx,
t->bx > ts->tiling.col_start,
t->by,
t->by > ts->tiling.row_start,
ts->tiling.col_end,
ts->tiling.row_end,
edge_flags, dst,
f->cur.stride[0 ], top_sb_edge,
b->y_mode, &angle,
t_dim->w, t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
// the last two arguments are the pixel distances from the current
// position to the right / bottom frame edge
dsp->ipred.intra_pred[m](dst, f->cur.stride[0 ], edge,
t_dim->w * 4 , t_dim->h * 4 ,
angle | intra_flags,
4 * f->bw - 4 * t->bx,
4 * f->bh - 4 * t->by
HIGHBD_CALL_SUFFIX);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
hex_dump(edge - t_dim->h * 4 , t_dim->h * 4 ,
t_dim->h * 4 , 2 , "l" );
hex_dump(edge, 0 , 1 , 1 , "tl" );
hex_dump(edge + 1 , t_dim->w * 4 ,
t_dim->w * 4 , 2 , "t" );
hex_dump(dst, f->cur.stride[0 ],
t_dim->w * 4 , t_dim->h * 4 , "y-intra-pred" );
}
skip_y_pred: {}
// luma residual
if (!b->skip) {
coef *cf;
int eob;
enum TxfmType txtp;
// frame-threading pass: consume stored coefficient info; cbi holds
// txtp in its low 5 bits and eob in the bits above
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1 ;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8 ) * imin(t_dim->h, 8 ) * 16 ;
eob = cbi >> 5 ;
txtp = cbi & 0 x1f;
} else {
// otherwise decode the coefficients now and store cf_ctx into the
// above / left luma coefficient contexts (clipped to the frame)
uint8_t cf_ctx;
cf = bitfn(t->cf);
eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
&t->l.lcoef[by4 + y], b->tx, bs,
b, 1 , 0 , cf, &txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n" ,
b->tx, txtp, eob, ts->msac.rng);
dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
}
// inverse transform + add, only when eob is non-negative
if (eob >= 0 ) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
coef_dump(cf, imin(t_dim->h, 8 ) * 4 ,
imin(t_dim->w, 8 ) * 4 , 3 , "dq" );
dsp->itx.itxfm_add[b->tx]
[txtp](dst,
f->cur.stride[0 ],
cf, eob HIGHBD_CALL_SUFFIX);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(dst, f->cur.stride[0 ],
t_dim->w * 4 , t_dim->h * 4 , "recon" );
}
} else if (!t->frame_thread.pass) {
// skipped block, no frame-threading pass: set the luma coefficient
// contexts covered by this transform block to 0x40
dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0 x40);
dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0 x40);
}
dst += 4 * t_dim->w;
}
// undo the horizontal advance of this row
t->bx -= x;
}
// undo the vertical advance of the luma loop
t->by -= y;
if (!has_chroma) continue ;
const ptrdiff_t stride = f->cur.stride[1 ];
// chroma-from-luma prediction; only valid for the first sub-area
if (b->uv_mode == CFL_PRED) {
assert(!init_x && !init_y);
int16_t *const ac = t->scratch.ac;
// luma source aligned down to the chroma sample grid
pixel *y_src = ((pixel *) f->cur.data[0 ]) + 4 * (t->bx & ~ss_hor) +
4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0 ]);
const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
(t->by >> ss_ver) * PXSTRIDE(stride));
pixel *const uv_dst[2 ] = { ((pixel *) f->cur.data[1 ]) + uv_off,
((pixel *) f->cur.data[2 ]) + uv_off };
// visible chroma extent rounded up to a multiple of the luma
// transform size; the difference to cbw4 / cbh4 is passed to cfl_ac
const int furthest_r =
((cw4 << ss_hor) + t_dim->w - 1 ) & ~(t_dim->w - 1 );
const int furthest_b =
((ch4 << ss_ver) + t_dim->h - 1 ) & ~(t_dim->h - 1 );
// presumably fills `ac` from the luma samples at y_src; `ac` is then
// consumed by cfl_pred[] below
dsp->ipred.cfl_ac[f->cur.p.layout - 1 ](ac, y_src, f->cur.stride[0 ],
cbw4 - (furthest_r >> ss_hor),
cbh4 - (furthest_b >> ss_ver),
cbw4 * 4 , cbh4 * 4 );
for (int pl = 0 ; pl < 2 ; pl++) {
// planes with a zero alpha are predicted with DC_PRED in the regular
// chroma loop further down
if (!b->cfl_alpha[pl]) continue ;
int angle = 0 ;
const pixel *top_sb_edge = NULL;
if (!((t->by & ~ss_ver) & (f->sb_step - 1 ))) {
top_sb_edge = f->ipred_edge[pl + 1 ];
const int sby = t->by >> f->sb_shift;
top_sb_edge += f->sb128w * 128 * (sby - 1 );
}
// positions and tile starts converted to chroma units
const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
const int xstart = ts->tiling.col_start >> ss_hor;
const int ystart = ts->tiling.row_start >> ss_ver;
const enum IntraPredMode m =
bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
ypos, ypos > ystart,
ts->tiling.col_end >> ss_hor,
ts->tiling.row_end >> ss_ver,
0 , uv_dst[pl], stride,
top_sb_edge, DC_PRED, &angle,
uv_t_dim->w, uv_t_dim->h, 0 ,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
uv_t_dim->w * 4 ,
uv_t_dim->h * 4 ,
ac, b->cfl_alpha[pl]
HIGHBD_CALL_SUFFIX);
}
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
ac_dump(ac, 4 *cbw4, 4 *cbh4, "ac" );
hex_dump(uv_dst[0 ], stride, cbw4 * 4 , cbh4 * 4 , "u-cfl-pred" );
hex_dump(uv_dst[1 ], stride, cbw4 * 4 , cbh4 * 4 , "v-cfl-pred" );
}
} else if (b->pal_sz[1 ]) {
// chroma palette prediction; both planes use the same index buffer
// with palette entries 1 (U) and 2 (V)
const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
(t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1 ]));
const pixel (*pal)[8 ];
const uint8_t *pal_idx;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1 ;
assert(ts->frame_thread[p].pal_idx);
pal = f->frame_thread.pal[((t->by >> 1 ) + (t->bx & 1 )) * (f->b4_stride >> 1 ) +
((t->bx >> 1 ) + (t->by & 1 ))];
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8 ;
} else {
pal = bytefn(t->scratch.pal);
pal_idx = t->scratch.pal_idx_uv;
}
f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1 ]) + uv_dstoff,
f->cur.stride[1 ], pal[1 ],
pal_idx, cbw4 * 4 , cbh4 * 4 );
f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2 ]) + uv_dstoff,
f->cur.stride[1 ], pal[2 ],
pal_idx, cbw4 * 4 , cbh4 * 4 );
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
hex_dump(((pixel *) f->cur.data[1 ]) + uv_dstoff,
PXSTRIDE(f->cur.stride[1 ]),
cbw4 * 4 , cbh4 * 4 , "u-pal-pred" );
hex_dump(((pixel *) f->cur.data[2 ]) + uv_dstoff,
PXSTRIDE(f->cur.stride[1 ]),
cbw4 * 4 , cbh4 * 4 , "v-pal-pred" );
}
}
// flags OR'ed into the angle argument of the chroma intra_pred[] call
const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
sm_uv_flag(&t->l, cby4);
// chroma counterparts of sb_has_tr / sb_has_bl; the edge flag bit is
// selected by shifting the I420 flag by (layout - 1)
const int uv_sb_has_tr =
((init_x + 16 ) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1 ));
const int uv_sb_has_bl =
init_x ? 0 : ((init_y + 16 ) >> ss_ver) < ch4 ? 1 :
intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1 ));
const int sub_cw4 = imin(cw4, (init_x + 16 ) >> ss_hor);
// chroma transform blocks of both planes; x / y count chroma units
// while t->bx / t->by keep advancing in luma units
for (int pl = 0 ; pl < 2 ; pl++) {
for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
{
pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
((t->bx + init_x) >> ss_hor));
for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
{
// prediction already done above via CfL (non-zero alpha) or palette
if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
b->pal_sz[1 ])
{
goto skip_uv_pred;
}
int angle = b->uv_angle;
// this probably looks weird because we're using
// luma flags in a chroma loop, but that's because
// prepare_intra_edges() expects luma flags as input
const enum EdgeFlags edge_flags =
(((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
(x + uv_t_dim->w >= sub_cw4)) ?
0 : EDGE_I444_TOP_HAS_RIGHT) |
((x > (init_x >> ss_hor) ||
(!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
0 : EDGE_I444_LEFT_HAS_BOTTOM);
const pixel *top_sb_edge = NULL;
if (!((t->by & ~ss_ver) & (f->sb_step - 1 ))) {
top_sb_edge = f->ipred_edge[1 + pl];
const int sby = t->by >> f->sb_shift;
top_sb_edge += f->sb128w * 128 * (sby - 1 );
}
// a CfL plane that reaches this point has a zero alpha and is
// predicted with DC_PRED instead
const enum IntraPredMode uv_mode =
b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
const int xstart = ts->tiling.col_start >> ss_hor;
const int ystart = ts->tiling.row_start >> ss_ver;
const enum IntraPredMode m =
bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
ypos, ypos > ystart,
ts->tiling.col_end >> ss_hor,
ts->tiling.row_end >> ss_ver,
edge_flags, dst, stride,
top_sb_edge, uv_mode,
&angle, uv_t_dim->w,
uv_t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
angle |= intra_edge_filter_flag;
// the last two arguments are the distances to the right / bottom
// frame edge, expressed in chroma pixels
dsp->ipred.intra_pred[m](dst, stride, edge,
uv_t_dim->w * 4 ,
uv_t_dim->h * 4 ,
angle | sm_uv_fl,
(4 * f->bw + ss_hor -
4 * (t->bx & ~ss_hor)) >> ss_hor,
(4 * f->bh + ss_ver -
4 * (t->by & ~ss_ver)) >> ss_ver
HIGHBD_CALL_SUFFIX);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
hex_dump(edge - uv_t_dim->h * 4 , uv_t_dim->h * 4 ,
uv_t_dim->h * 4 , 2 , "l" );
hex_dump(edge, 0 , 1 , 1 , "tl" );
hex_dump(edge + 1 , uv_t_dim->w * 4 ,
uv_t_dim->w * 4 , 2 , "t" );
hex_dump(dst, stride, uv_t_dim->w * 4 ,
uv_t_dim->h * 4 , pl ? "v-intra-pred" : "u-intra-pred" );
}
skip_uv_pred: {}
// chroma residual, same scheme as for luma
if (!b->skip) {
enum TxfmType txtp;
int eob;
coef *cf;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1 ;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16 ;
eob = cbi >> 5 ;
txtp = cbi & 0 x1f;
} else {
uint8_t cf_ctx;
cf = bitfn(t->cf);
eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
&t->l.ccoef[pl][cby4 + y],
b->uvtx, bs, b, 1 , 1 + pl, cf,
&txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n" ,
pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
// context widths clipped to the frame, in chroma units
int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
}
if (eob >= 0 ) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
coef_dump(cf, uv_t_dim->h * 4 ,
uv_t_dim->w * 4 , 3 , "dq" );
dsp->itx.itxfm_add[b->uvtx]
[txtp](dst, stride,
cf, eob HIGHBD_CALL_SUFFIX);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(dst, stride, uv_t_dim->w * 4 ,
uv_t_dim->h * 4 , "recon" );
}
} else if (!t->frame_thread.pass) {
// skipped block: set the chroma coefficient contexts to 0x40
dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0 x40);
dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0 x40);
}
dst += uv_t_dim->w * 4 ;
}
// x is in chroma units, t->bx in luma units
t->bx -= x << ss_hor;
}
t->by -= y << ss_ver;
}
}
}
}
/*
 * Reconstruct one inter-coded (or intrabc) block: build the luma and chroma
 * prediction, then, unless b->skip is set, read and add the residual.
 *
 * Prediction takes one of three paths:
 *  - IS_KEY_OR_INTRA(f->frame_hdr): intrabc, mc() from f->sr_cur with the
 *    bilinear filter;
 *  - b->comp_type == COMP_INTER_NONE: single reference through mc() or
 *    warp_affine(), optionally followed by obmc() and inter-intra blending;
 *  - otherwise: compound prediction, two references rendered into
 *    t->scratch.compinter and combined with avg / w_avg / w_mask / mask.
 *
 * t  - task context; t->bx / t->by are advanced during the residual loops
 *      and restored afterwards
 * bs - block size, used as index into dav1d_block_dimensions
 * b  - block description
 *
 * Returns 0 on success, or the first non-zero value returned by mc(),
 * obmc() or warp_affine().
 */
int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
const Av1Block *const b)
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const Dav1dDSPContext *const dsp = f->dsp;
// block position within a 32-unit window (low 5 bits of bx / by)
const int bx4 = t->bx & 31 , by4 = t->by & 31 ;
// chroma subsampling flags derived from the pixel layout
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bw4 = b_dim[0 ], bh4 = b_dim[1 ];
// block size in 4-pixel units, clipped to the frame size
const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
// chroma is handled here unless the layout is I400, or the block is only
// one unit in a subsampled dimension and sits at an even position
const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
(bw4 > ss_hor || t->bx & 1 ) &&
(bh4 > ss_ver || t->by & 1 );
// layout index passed to II_MASK / WEDGE_MASK / w_mask[]: 0 for I400,
// otherwise I444 - layout
const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
int res;
// prediction
const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
// luma destination and chroma destination offset of the block
pixel *dst = ((pixel *) f->cur.data[0 ]) +
4 * (t->by * PXSTRIDE(f->cur.stride[0 ]) + t->bx);
const ptrdiff_t uvdstoff =
4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1 ]));
if (IS_KEY_OR_INTRA(f->frame_hdr)) {
// intrabc
assert(!f->frame_hdr->super_res.enabled);
res = mc(t, dst, NULL, f->cur.stride[0 ], bw4, bh4, t->bx, t->by, 0 ,
b->mv[0 ], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
if (res) return res;
// chroma: a dimension that is a single unit under subsampling is doubled
// and the position is aligned down to an even unit
if (has_chroma) for (int pl = 1 ; pl < 3 ; pl++) {
res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1 ],
bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0 ],
&f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
if (res) return res;
}
} else if (b->comp_type == COMP_INTER_NONE) {
// single-reference prediction
const Dav1dThreadPicture *const refp = &f->refp[b->ref[0 ]];
const enum Filter2d filter_2d = b->filter2d;
// luma: warped prediction when the block is larger than one unit in both
// dimensions and either global motion warping is allowed for GLOBALMV or
// the block uses a non-translational local warp
if (imin(bw4, bh4) > 1 &&
((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0 ]]) ||
(b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
{
res = warp_affine(t, dst, NULL, f->cur.stride[0 ], b_dim, 0 , refp,
b->motion_mode == MM_WARP ? &t->warpmv :
&f->frame_hdr->gmv[b->ref[0 ]]);
if (res) return res;
} else {
// regular translational prediction, optionally refined by OBMC
res = mc(t, dst, NULL, f->cur.stride[0 ],
bw4, bh4, t->bx, t->by, 0 , b->mv[0 ], refp, b->ref[0 ], filter_2d);
if (res) return res;
if (b->motion_mode == MM_OBMC) {
res = obmc(t, dst, f->cur.stride[0 ], b_dim, 0 , bx4, by4, w4, h4);
if (res) return res;
}
}
// luma inter-intra: render an intra prediction into scratch memory and
// blend it into dst with the mask selected by II_MASK
if (b->interintra_type) {
pixel *const tl_edge = bitfn(t->scratch.edge) + 32 ;
enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
SMOOTH_PRED : b->interintra_mode;
pixel *const tmp = bitfn(t->scratch.interintra);
int angle = 0 ;
// when t->by is a multiple of f->sb_step, pass a pointer into
// f->ipred_edge[0] for the row above (sby - 1)
const pixel *top_sb_edge = NULL;
if (!(t->by & (f->sb_step - 1 ))) {
top_sb_edge = f->ipred_edge[0 ];
const int sby = t->by >> f->sb_shift;
top_sb_edge += f->sb128w * 128 * (sby - 1 );
}
m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
t->by, t->by > ts->tiling.row_start,
ts->tiling.col_end, ts->tiling.row_end,
0 , dst, f->cur.stride[0 ], top_sb_edge,
m, &angle, bw4, bh4, 0 , tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof (pixel),
tl_edge, bw4 * 4 , bh4 * 4 , 0 , 0 , 0
HIGHBD_CALL_SUFFIX);
dsp->mc.blend(dst, f->cur.stride[0 ], tmp,
bw4 * 4 , bh4 * 4 , II_MASK(0 , bs, b));
}
if (!has_chroma) goto skip_inter_chroma_pred;
// sub8x8 derivation
// the sub8x8 path is only kept when every neighbour it would read
// (left, top, top-left as applicable) has a reference frame
int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
refmvs_block *const *r;
if (is_sub8x8) {
assert(ss_hor == 1 );
r = &t->rt.r[(t->by & 31 ) + 5 ];
if (bw4 == 1 ) is_sub8x8 &= r[0 ][t->bx - 1 ].ref.ref[0 ] > 0 ;
if (bh4 == ss_ver) is_sub8x8 &= r[-1 ][t->bx].ref.ref[0 ] > 0 ;
if (bw4 == 1 && bh4 == ss_ver)
is_sub8x8 &= r[-1 ][t->bx - 1 ].ref.ref[0 ] > 0 ;
}
// chroma prediction
if (is_sub8x8) {
assert(ss_hor == 1 );
// pixel offsets of the current block's part inside the shared chroma
// area; set to 2 pixels / 2 rows once a left / top neighbour part exists
ptrdiff_t h_off = 0 , v_off = 0 ;
// top-left neighbour part, using that neighbour's mv, reference and filter
if (bw4 == 1 && bh4 == ss_ver) {
for (int pl = 0 ; pl < 2 ; pl++) {
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
NULL, f->cur.stride[1 ],
bw4, bh4, t->bx - 1 , t->by - 1 , 1 + pl,
r[-1 ][t->bx - 1 ].mv.mv[0 ],
&f->refp[r[-1 ][t->bx - 1 ].ref.ref[0 ] - 1 ],
r[-1 ][t->bx - 1 ].ref.ref[0 ] - 1 ,
t->frame_thread.pass != 2 ? t->tl_4x4_filter :
f->frame_thread.b[((t->by - 1 ) * f->b4_stride) + t->bx - 1 ].filter2d);
if (res) return res;
}
v_off = 2 * PXSTRIDE(f->cur.stride[1 ]);
h_off = 2 ;
}
// left neighbour part
if (bw4 == 1 ) {
const enum Filter2d left_filter_2d =
dav1d_filter_2d[t->l.filter[1 ][by4]][t->l.filter[0 ][by4]];
for (int pl = 0 ; pl < 2 ; pl++) {
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
f->cur.stride[1 ], bw4, bh4, t->bx - 1 ,
t->by, 1 + pl, r[0 ][t->bx - 1 ].mv.mv[0 ],
&f->refp[r[0 ][t->bx - 1 ].ref.ref[0 ] - 1 ],
r[0 ][t->bx - 1 ].ref.ref[0 ] - 1 ,
t->frame_thread.pass != 2 ? left_filter_2d :
f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1 ].filter2d);
if (res) return res;
}
h_off = 2 ;
}
// top neighbour part
if (bh4 == ss_ver) {
const enum Filter2d top_filter_2d =
dav1d_filter_2d[t->a->filter[1 ][bx4]][t->a->filter[0 ][bx4]];
for (int pl = 0 ; pl < 2 ; pl++) {
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
f->cur.stride[1 ], bw4, bh4, t->bx, t->by - 1 ,
1 + pl, r[-1 ][t->bx].mv.mv[0 ],
&f->refp[r[-1 ][t->bx].ref.ref[0 ] - 1 ],
r[-1 ][t->bx].ref.ref[0 ] - 1 ,
t->frame_thread.pass != 2 ? top_filter_2d :
f->frame_thread.b[((t->by - 1 ) * f->b4_stride) + t->bx].filter2d);
if (res) return res;
}
v_off = 2 * PXSTRIDE(f->cur.stride[1 ]);
}
// the current block's own part, placed after the neighbour parts
for (int pl = 0 ; pl < 2 ; pl++) {
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1 ],
bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0 ],
refp, b->ref[0 ], filter_2d);
if (res) return res;
}
} else {
// regular chroma prediction: warp when the chroma block is larger than
// one unit in both dimensions, otherwise mc() (+ optional OBMC)
if (imin(cbw4, cbh4) > 1 &&
((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0 ]]) ||
(b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
{
for (int pl = 0 ; pl < 2 ; pl++) {
res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
f->cur.stride[1 ], b_dim, 1 + pl, refp,
b->motion_mode == MM_WARP ? &t->warpmv :
&f->frame_hdr->gmv[b->ref[0 ]]);
if (res) return res;
}
} else {
for (int pl = 0 ; pl < 2 ; pl++) {
res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
NULL, f->cur.stride[1 ],
bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
t->bx & ~ss_hor, t->by & ~ss_ver,
1 + pl, b->mv[0 ], refp, b->ref[0 ], filter_2d);
if (res) return res;
if (b->motion_mode == MM_OBMC) {
res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
f->cur.stride[1 ], b_dim, 1 + pl, bx4, by4, w4, h4);
if (res) return res;
}
}
}
// chroma inter-intra, same scheme as luma but in chroma units
if (b->interintra_type) {
// FIXME for 8x32 with 4:2:2 subsampling, this probably does
// the wrong thing since it will select 4x16, not 4x32, as a
// transform size...
const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
for (int pl = 0 ; pl < 2 ; pl++) {
pixel *const tmp = bitfn(t->scratch.interintra);
pixel *const tl_edge = bitfn(t->scratch.edge) + 32 ;
enum IntraPredMode m =
b->interintra_mode == II_SMOOTH_PRED ?
SMOOTH_PRED : b->interintra_mode;
int angle = 0 ;
pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
const pixel *top_sb_edge = NULL;
if (!(t->by & (f->sb_step - 1 ))) {
top_sb_edge = f->ipred_edge[pl + 1 ];
const int sby = t->by >> f->sb_shift;
top_sb_edge += f->sb128w * 128 * (sby - 1 );
}
m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
(t->bx >> ss_hor) >
(ts->tiling.col_start >> ss_hor),
t->by >> ss_ver,
(t->by >> ss_ver) >
(ts->tiling.row_start >> ss_ver),
ts->tiling.col_end >> ss_hor,
ts->tiling.row_end >> ss_ver,
0 , uvdst, f->cur.stride[1 ],
top_sb_edge, m,
&angle, cbw4, cbh4, 0 , tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof (pixel),
tl_edge, cbw4 * 4 , cbh4 * 4 , 0 , 0 , 0
HIGHBD_CALL_SUFFIX);
dsp->mc.blend(uvdst, f->cur.stride[1 ], tmp,
cbw4 * 4 , cbh4 * 4 , ii_mask);
}
}
}
skip_inter_chroma_pred: {}
// remember this block's filter; it is read back as the top-left
// neighbour's filter in the sub8x8 chroma path above
t->tl_4x4_filter = filter_2d;
} else {
// compound prediction from two references
const enum Filter2d filter_2d = b->filter2d;
// Maximum super block size is 128x128
int16_t (*tmp)[128 * 128 ] = t->scratch.compinter;
// jnt_weight is only set for COMP_INTER_WEIGHTED_AVG and mask only for
// COMP_INTER_SEG / COMP_INTER_WEDGE; each is only read in the chroma
// switch under the same comp_type
int jnt_weight;
uint8_t *const seg_mask = t->scratch.seg_mask;
const uint8_t *mask;
// render both luma references into the intermediate buffers tmp[0] / tmp[1]
for (int i = 0 ; i < 2 ; i++) {
const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
res = warp_affine(t, NULL, tmp[i], bw4 * 4 , b_dim, 0 , refp,
&f->frame_hdr->gmv[b->ref[i]]);
if (res) return res;
} else {
res = mc(t, NULL, tmp[i], 0 , bw4, bh4, t->bx, t->by, 0 ,
b->mv[i], refp, b->ref[i], filter_2d);
if (res) return res;
}
}
// combine the two luma predictions according to the compound type
switch (b->comp_type) {
case COMP_INTER_AVG:
dsp->mc.avg(dst, f->cur.stride[0 ], tmp[0 ], tmp[1 ],
bw4 * 4 , bh4 * 4 HIGHBD_CALL_SUFFIX);
break ;
case COMP_INTER_WEIGHTED_AVG:
jnt_weight = f->jnt_weights[b->ref[0 ]][b->ref[1 ]];
dsp->mc.w_avg(dst, f->cur.stride[0 ], tmp[0 ], tmp[1 ],
bw4 * 4 , bh4 * 4 , jnt_weight HIGHBD_CALL_SUFFIX);
break ;
case COMP_INTER_SEG:
// w_mask receives seg_mask, which is then reused as the chroma mask
dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0 ],
tmp[b->mask_sign], tmp[!b->mask_sign],
bw4 * 4 , bh4 * 4 , seg_mask,
b->mask_sign HIGHBD_CALL_SUFFIX);
mask = seg_mask;
break ;
case COMP_INTER_WEDGE:
mask = WEDGE_MASK(0 , bs, 0 , b->wedge_idx);
dsp->mc.mask(dst, f->cur.stride[0 ],
tmp[b->mask_sign], tmp[!b->mask_sign],
bw4 * 4 , bh4 * 4 , mask HIGHBD_CALL_SUFFIX);
// switch to the chroma-layout wedge mask for the chroma planes
if (has_chroma)
mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
break ;
}
// chroma
if (has_chroma) for (int pl = 0 ; pl < 2 ; pl++) {
for (int i = 0 ; i < 2 ; i++) {
const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
if (b->inter_mode == GLOBALMV_GLOBALMV &&
imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
{
res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
b_dim, 1 + pl,
refp, &f->frame_hdr->gmv[b->ref[i]]);
if (res) return res;
} else {
res = mc(t, NULL, tmp[i], 0 , bw4, bh4, t->bx, t->by,
1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
if (res) return res;
}
}
pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
switch (b->comp_type) {
case COMP_INTER_AVG:
dsp->mc.avg(uvdst, f->cur.stride[1 ], tmp[0 ], tmp[1 ],
bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
HIGHBD_CALL_SUFFIX);
break ;
case COMP_INTER_WEIGHTED_AVG:
dsp->mc.w_avg(uvdst, f->cur.stride[1 ], tmp[0 ], tmp[1 ],
bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
HIGHBD_CALL_SUFFIX);
break ;
case COMP_INTER_WEDGE:
case COMP_INTER_SEG:
dsp->mc.mask(uvdst, f->cur.stride[1 ],
tmp[b->mask_sign], tmp[!b->mask_sign],
bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
HIGHBD_CALL_SUFFIX);
break ;
}
}
}
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
hex_dump(dst, f->cur.stride[0 ], b_dim[0 ] * 4 , b_dim[1 ] * 4 , "y-pred" );
if (has_chroma) {
hex_dump(&((pixel *) f->cur.data[1 ])[uvdstoff], f->cur.stride[1 ],
cbw4 * 4 , cbh4 * 4 , "u-pred" );
hex_dump(&((pixel *) f->cur.data[2 ])[uvdstoff], f->cur.stride[1 ],
cbw4 * 4 , cbh4 * 4 , "v-pred" );
}
}
// frame-clipped chroma size in chroma units
const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
if (b->skip) {
// reset coef contexts
BlockContext *const a = t->a;
dav1d_memset_pow2[b_dim[2 ]](&a->lcoef[bx4], 0 x40);
dav1d_memset_pow2[b_dim[3 ]](&t->l.lcoef[by4], 0 x40);
if (has_chroma) {
dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
memset_cw(&a->ccoef[0 ][cbx4], 0 x40);
memset_cw(&a->ccoef[1 ][cbx4], 0 x40);
memset_ch(&t->l.ccoef[0 ][cby4], 0 x40);
memset_ch(&t->l.ccoef[1 ][cby4], 0 x40);
}
return 0 ;
}
// residual: luma goes through read_coef_tree() starting at the largest
// luma transform size, chroma uses the fixed b->uvtx size
const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
const uint16_t tx_split[2 ] = { b->tx_split0, b->tx_split1 };
// iterate over sub-areas of at most 16x16 units
for (int init_y = 0 ; init_y < bh4; init_y += 16 ) {
for (int init_x = 0 ; init_x < bw4; init_x += 16 ) {
// coefficient coding & inverse transforms
int y_off = !!init_y, y;
// dst and t->by / t->bx are advanced while iterating and rewound after
// each loop
dst += PXSTRIDE(f->cur.stride[0 ]) * 4 * init_y;
for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16 );
y += ytx->h, y_off++)
{
int x, x_off = !!init_x;
for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16 );
x += ytx->w, x_off++)
{
read_coef_tree(t, bs, b, b->max_ytx, 0 , tx_split,
x_off, y_off, &dst[x * 4 ]);
t->bx += ytx->w;
}
dst += PXSTRIDE(f->cur.stride[0 ]) * 4 * ytx->h;
t->bx -= x;
t->by += ytx->h;
}
dst -= PXSTRIDE(f->cur.stride[0 ]) * 4 * y;
t->by -= y;
// chroma coefs and inverse transform
if (has_chroma) for (int pl = 0 ; pl < 2 ; pl++) {
pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
(PXSTRIDE(f->cur.stride[1 ]) * init_y * 4 >> ss_ver);
// x / y count chroma units while t->bx / t->by advance in luma units
for (y = init_y >> ss_ver, t->by += init_y;
y < imin(ch4, (init_y + 16 ) >> ss_ver); y += uvtx->h)
{
int x;
for (x = init_x >> ss_hor, t->bx += init_x;
x < imin(cw4, (init_x + 16 ) >> ss_hor); x += uvtx->w)
{
coef *cf;
int eob;
enum TxfmType txtp;
// frame-threading pass: consume stored coefficient info; cbi holds
// txtp in its low 5 bits and eob in the bits above
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1 ;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16 ;
eob = cbi >> 5 ;
txtp = cbi & 0 x1f;
} else {
uint8_t cf_ctx;
cf = bitfn(t->cf);
// txtp is seeded from the per-position map in t->scratch.txtp_map
// (indexed in luma units) before being handed to decode_coefs()
txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
bx4 + (x << ss_hor)];
eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
&t->l.ccoef[pl][cby4 + y],
b->uvtx, bs, b, 0 , 1 + pl,
cf, &txtp, &cf_ctx);
if (DEBUG_BLOCK_INFO)
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n" ,
pl, b->uvtx, txtp, eob, ts->msac.rng);
// context widths clipped to the frame, in chroma units
int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
}
// inverse transform + add, only when eob is non-negative
if (eob >= 0 ) {
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
coef_dump(cf, uvtx->h * 4 , uvtx->w * 4 , 3 , "dq" );
dsp->itx.itxfm_add[b->uvtx]
[txtp](&uvdst[4 * x],
f->cur.stride[1 ],
cf, eob HIGHBD_CALL_SUFFIX);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
hex_dump(&uvdst[4 * x], f->cur.stride[1 ],
uvtx->w * 4 , uvtx->h * 4 , "recon" );
}
t->bx += uvtx->w << ss_hor;
}
uvdst += PXSTRIDE(f->cur.stride[1 ]) * 4 * uvtx->h;
t->bx -= x << ss_hor;
t->by += uvtx->h << ss_ver;
}
t->by -= y << ss_ver;
}
}
}
return 0 ;
}
/*
 * Invoke dav1d_loopfilter_sbrow_cols() for superblock row `sby`.
 *
 * Nothing is done when the deblock bit is not set in f->c->inloop_filters,
 * or when both luma levels loopfilter.level_y[0] and level_y[1] are zero.
 *
 * f:   frame context; supplies the plane pointers (f->lf.p), the strides
 *      and the filter mask array (f->lf.mask).
 * sby: index of the superblock row to process.
 */
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK))
        return;
    if (!(f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
        return;

    // Vertical subsampling applies to the chroma row offset for 4:2:0 only.
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    // First luma pixel row of this superblock row (sb_step is scaled by 4).
    const int y = sby * f->sb_step * 4;
    const ptrdiff_t luma_off = y * PXSTRIDE(f->cur.stride[0]);
    const ptrdiff_t chroma_off = y * PXSTRIDE(f->cur.stride[1]) >> ss_ver;

    pixel *const p[3] = {
        f->lf.p[0] + luma_off,
        f->lf.p[1] + chroma_off,
        f->lf.p[2] + chroma_off
    };

    // The mask row index is sby when sb128 is set, sby / 2 otherwise.
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;

    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
                                        f->lf.start_of_tile_row[sby]);
}
/*
 * Invoke dav1d_loopfilter_sbrow_rows() for superblock row `sby` (when
 * deblocking is enabled and at least one luma level is non-zero), then call
 * dav1d_copy_lpf() if the sequence uses CDEF or any plane is flagged in
 * f->lf.restore_planes.
 *
 * f:   frame context.
 * sby: index of the superblock row to process.
 */
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
    // Vertical subsampling applies to the chroma row offset for 4:2:0 only.
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int y = sby * f->sb_step * 4;
    const ptrdiff_t luma_off = y * PXSTRIDE(f->cur.stride[0]);
    const ptrdiff_t chroma_off = y * PXSTRIDE(f->cur.stride[1]) >> ss_ver;

    pixel *const p[3] = {
        f->lf.p[0] + luma_off,
        f->lf.p[1] + chroma_off,
        f->lf.p[2] + chroma_off
    };

    // The mask row index is sby when sb128 is set, sby / 2 otherwise.
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;

    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) {
        if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])
            bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
    }

    // Store loop filtered pixels required by CDEF / LR
    if (f->seq_hdr->cdef || f->lf.restore_planes)
        bytefn(dav1d_copy_lpf)(f, p, sby);
}
/*
 * Run dav1d_cdef_brow() over superblock row `sby`.
 *
 * Does nothing unless the CDEF bit is set in f->c->inloop_filters.
 *
 * For every row except the first, the last two block rows of the previous
 * superblock row ([start - 2, start)) are processed first, using pointers
 * 8 luma pixel rows above the current row and the previous row's mask.
 * Then the current row is processed; for every row except the last, its final
 * two block rows are left out (they are covered by the next call).
 *
 * tc:  task context; tc->f is the frame being filtered.
 * sby: index of the superblock row to process.
 */
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
    const Dav1dFrameContext *const f = tc->f;
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;

    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    // Vertical subsampling applies to the chroma row offset for 4:2:0 only.
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    const int start = sby * sbsz;

    if (sby) {
        // Computed only here: for sby == 0 the expression would right-shift
        // a negative value and form a pointer before the start of f->lf.mask.
        Av1Filter *prev_mask =
            f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
        pixel *p_up[3] = {
            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        };
        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
    }

    // Hold back two block rows unless this is the last superblock row.
    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
    const int end = imin(start + n_blks, f->bh);
    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
}
/*
 * Call f->dsp->mc.resize() for each plane of superblock row `sby`, reading
 * from f->lf.p (strides f->cur.stride) and writing to f->lf.sr_p (strides
 * f->sr_cur.p.stride).
 *
 * For every row except the first, the processed region starts 8 luma pixel
 * rows above the superblock row; for every row except the last, it stops
 * 8 luma pixel rows (two 4-pixel block rows) short of the row's end. The
 * height is additionally limited to the rows remaining in the picture.
 *
 * f:   frame context.
 * sby: index of the superblock row to process.
 */
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    const int is_420 = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int is_444 = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I444;
    const int n_planes = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 3 : 1;

    const ptrdiff_t src_luma_off = y * PXSTRIDE(f->cur.stride[0]);
    const ptrdiff_t src_chroma_off = y * PXSTRIDE(f->cur.stride[1]) >> is_420;
    const pixel *const p[3] = {
        f->lf.p[0] + src_luma_off,
        f->lf.p[1] + src_chroma_off,
        f->lf.p[2] + src_chroma_off
    };

    const ptrdiff_t dst_luma_off = y * PXSTRIDE(f->sr_cur.p.stride[0]);
    const ptrdiff_t dst_chroma_off = y * PXSTRIDE(f->sr_cur.p.stride[1]) >> is_420;
    pixel *const sr_p[3] = {
        f->lf.sr_p[0] + dst_luma_off,
        f->lf.sr_p[1] + dst_chroma_off,
        f->lf.sr_p[2] + dst_chroma_off
    };

    // Luma-unit extents shared by all planes; scaled per plane below.
    const int rows_above = sby ? 8 : 0;
    const int rows_in_sb = 4 * (sbsz - 2 * (sby + 1 < f->sbh));

    for (int pl = 0; pl < n_planes; pl++) {
        const int uv = pl != 0; // selects stride / step / start index 0 or 1
        const int ss_ver = uv && is_420;
        const int ss_hor = uv && !is_444;

        const int h_start = rows_above >> ss_ver;
        const int h_end = rows_in_sb >> ss_ver;

        const ptrdiff_t dst_stride = f->sr_cur.p.stride[uv];
        const ptrdiff_t src_stride = f->cur.stride[uv];
        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);

        // Widths are rounded up when horizontally subsampled.
        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
        // Rows left in the picture from the top of this superblock row.
        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;

        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
                          imin(img_h, h_end) + h_start, src_w,
                          f->resize_step[uv], f->resize_start[uv]
                          HIGHBD_CALL_SUFFIX);
    }
}
/*
 * Invoke dav1d_lr_sbrow() for superblock row `sby` on the f->lf.sr_p planes.
 *
 * Does nothing unless the restoration bit is set in f->c->inloop_filters.
 *
 * f:   frame context.
 * sby: index of the superblock row to process.
 */
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION) {
        // Vertical subsampling applies to the chroma row offset for 4:2:0 only.
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        const int y = sby * f->sb_step * 4;
        const ptrdiff_t luma_off = y * PXSTRIDE(f->sr_cur.p.stride[0]);
        const ptrdiff_t chroma_off = y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;

        pixel *const sr_p[3] = {
            f->lf.sr_p[0] + luma_off,
            f->lf.sr_p[1] + chroma_off,
            f->lf.sr_p[2] + chroma_off
        };
        bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
    }
}
/*
 * Run every post-filter stage on superblock row `sby`, in this fixed order:
 * deblock columns, deblock rows, CDEF, resize, loop restoration.
 *
 * The deblock entry points are always called (they perform their own enable
 * checks); the remaining stages are called conditionally as shown below.
 * CDEF is run with the context's first task context (f->c->tc).
 *
 * f:   frame context.
 * sby: index of the superblock row to process.
 */
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);

    // CDEF only when the sequence header enables it.
    if (f->seq_hdr->cdef) {
        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
    }

    // Resize only when the two frame-header widths differ.
    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
        bytefn(dav1d_filter_sbrow_resize)(f, sby);
    }

    // Loop restoration only when at least one plane is flagged.
    if (f->lf.restore_planes) {
        bytefn(dav1d_filter_sbrow_lr)(f, sby);
    }
}
/*
 * Copy one pixel row per plane from the current picture into f->ipred_edge,
 * restricted to the columns of the current tile.
 *
 * The luma row copied is row (t->by + f->sb_step) * 4 - 1; the chroma row is
 * the equivalent row after vertical subsampling. The destination offset is
 * sby_off (f->sb128w * 128 * superblock-row index) plus the tile's starting
 * pixel column. Chroma planes are skipped for I400 pictures.
 *
 * t: task context; t->by selects the superblock row and t->ts the tile.
 */
void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;

    const int sby = t->by >> f->sb_shift;
    const int sby_off = f->sb128w * 128 * sby;
    const int x_off = ts->tiling.col_start;
    // Tile width, in the same 4-pixel units as col_start / col_end.
    const int tile_w4 = ts->tiling.col_end - x_off;

    // Luma
    const pixel *const y =
        ((const pixel *) f->cur.data[0]) + x_off * 4 +
        ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y, 4 * tile_w4);

    if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400)
        return;

    // Chroma
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int uv_x = x_off * 4 >> ss_hor;
    const int uv_w = 4 * tile_w4 >> ss_hor;
    const ptrdiff_t uv_off = uv_x +
        (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);

    for (int plane = 1; plane <= 2; plane++) {
        pixel_copy(&f->ipred_edge[plane][sby_off + uv_x],
                   &((const pixel *) f->cur.data[plane])[uv_off],
                   uv_w);
    }
}
/*
 * Copy the current block's 8-entry luma palette into the al_pal context:
 * into al_pal[0][bx4 .. bx4 + bw4 - 1][0] and al_pal[1][by4 .. by4 + bh4 - 1][0].
 *
 * The source palette is the frame-thread palette buffer entry for this block
 * when t->frame_thread.pass is non-zero, otherwise the task's scratch palette.
 *
 * bx4, by4: first index into the al_pal[0] / al_pal[1] arrays.
 * bw4, bh4: number of entries to fill in each of them.
 */
void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
                                    const int bx4, const int by4,
                                    const int bw4, const int bh4)
{
    const Dav1dFrameContext *const f = t->f;

    pixel *pal;
    if (t->frame_thread.pass) {
        pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                  ((t->bx >> 1) + (t->by & 1))][0];
    } else {
        pal = bytefn(t->scratch.pal)[0];
    }

    const size_t pal_bytes = 8 * sizeof(pixel);

    for (int col = 0; col < bw4; col++) {
        memcpy(bytefn(t->al_pal)[0][bx4 + col][0], pal, pal_bytes);
    }
    for (int row = 0; row < bh4; row++) {
        memcpy(bytefn(t->al_pal)[1][by4 + row][0], pal, pal_bytes);
    }
}
/*
 * Copy the current block's two 8-entry chroma palettes (planes 1 and 2) into
 * the al_pal context: into al_pal[0][bx4 .. bx4 + bw4 - 1][pl] and
 * al_pal[1][by4 .. by4 + bh4 - 1][pl].
 *
 * The source palettes come from the frame-thread palette buffer entry for
 * this block when t->frame_thread.pass is non-zero, otherwise from the task's
 * scratch palette.
 *
 * bx4, by4: first index into the al_pal[0] / al_pal[1] arrays.
 * bw4, bh4: number of entries to fill in each of them.
 */
void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
                                     const int bx4, const int by4,
                                     const int bw4, const int bh4)
{
    const Dav1dFrameContext *const f = t->f;

    const pixel (*pal)[8];
    if (t->frame_thread.pass) {
        pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                  ((t->bx >> 1) + (t->by & 1))];
    } else {
        pal = bytefn(t->scratch.pal);
    }

    const size_t pal_bytes = 8 * sizeof(pixel);

    // see aomedia bug 2183 for why we use luma coordinates here
    for (int plane = 1; plane <= 2; plane++) {
        for (int col = 0; col < bw4; col++) {
            memcpy(bytefn(t->al_pal)[0][bx4 + col][plane], pal[plane], pal_bytes);
        }
        for (int row = 0; row < bh4; row++) {
            memcpy(bytefn(t->al_pal)[1][by4 + row][plane], pal[plane], pal_bytes);
        }
    }
}
/*
 * Decode the palette of plane `pl` for block `b` from the tile's entropy
 * decoder (ts->msac) and store it, in ascending order, in the block's palette
 * buffer (frame-thread buffer when t->frame_thread.pass is non-zero,
 * otherwise t->scratch.pal).
 *
 * Steps:
 *  1. Decode the palette size (symbol + 2) and store it in b->pal_sz[pl].
 *  2. Build a duplicate-free candidate cache from the left and above
 *     neighbour palettes.
 *  3. Read one flag per cache entry to select the reused entries.
 *  4. Decode the remaining entries (first one raw, the rest as deltas).
 *  5. Merge reused and newly decoded entries into the final palette.
 *
 * t:        task context (neighbour palettes, scratch buffers, tile state).
 * b:        block being decoded; b->pal_sz[pl] is written.
 * pl:       plane index; the code treats pl == 0 differently from pl != 0.
 * sz_ctx:   context index into ts->cdf.m.pal_sz[pl].
 * bx4, by4: indices into the above / left neighbour context arrays.
 */
void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
const int pl, const int sz_ctx,
const int bx4, const int by4)
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
// palette size is the decoded symbol plus 2
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 6 ) + 2 ;
// cache: merged neighbour candidates; used_cache: candidates flagged for reuse
pixel cache[16 ], used_cache[8 ];
// number of entries in the left neighbour palette (separate array for pl != 0)
int l_cache = pl ? t->pal_sz_uv[1 ][by4] : t->l.pal_sz[by4];
int n_cache = 0 ;
// don't reuse above palette outside SB64 boundaries
int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0 ][bx4] : t->a->pal_sz[bx4] : 0 ;
const pixel *l = bytefn(t->al_pal)[1 ][by4][pl];
const pixel *a = bytefn(t->al_pal)[0 ][bx4][pl];
// fill/sort cache
// Two-way merge of the left and above lists: the smaller head is taken each
// step and appended only if it differs from the last cache entry.
// NOTE(review): this yields a sorted cache only if both inputs are already
// sorted ascending - confirm against the writers of al_pal.
while (l_cache && a_cache) {
if (*l < *a) {
if (!n_cache || cache[n_cache - 1 ] != *l)
cache[n_cache++] = *l;
l++;
l_cache--;
} else {
// equal heads: consume the left entry too so the value is added once
if (*a == *l) {
l++;
l_cache--;
}
if (!n_cache || cache[n_cache - 1 ] != *a)
cache[n_cache++] = *a;
a++;
a_cache--;
}
}
// append whatever remains of the non-exhausted list, still skipping repeats
if (l_cache) {
do {
if (!n_cache || cache[n_cache - 1 ] != *l)
cache[n_cache++] = *l;
l++;
} while (--l_cache > 0 );
} else if (a_cache) {
do {
if (!n_cache || cache[n_cache - 1 ] != *a)
cache[n_cache++] = *a;
a++;
} while (--a_cache > 0 );
}
// find reused cache entries
// one flag is read per cache entry, stopping once pal_sz entries are selected
int i = 0 ;
for (int n = 0 ; n < n_cache && i < pal_sz; n++)
if (dav1d_msac_decode_bool_equi(&ts->msac))
used_cache[i++] = cache[n];
const int n_used_cache = i;
// parse new entries
pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1 ) + (t->bx & 1 )) * (f->b4_stride >> 1 ) +
((t->bx >> 1 ) + (t->by & 1 ))][pl] :
bytefn(t->scratch.pal)[pl];
if (i < pal_sz) {
// new entries are written to pal[n_used_cache .. pal_sz - 1] for now
const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
// the first new entry is read as a raw bpc-bit value
int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
if (i < pal_sz) {
// later entries are deltas against the previous one, read with `bits` bits
int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2 );
const int max = (1 << bpc) - 1 ;
do {
const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
// !pl adds an extra 1 to the step for plane 0 only; result is clamped to max
prev = pal[i++] = imin(prev + delta + !pl, max);
// no larger value can follow: fill the rest with max and stop decoding
if (prev + !pl >= max) {
for (; i < pal_sz; i++)
pal[i] = max;
break ;
}
// shrink the delta width to what the remaining range up to max needs
bits = imin(bits, 1 + ulog2(max - prev - !pl));
} while (i < pal_sz);
}
// merge cache+new entries
// n walks used_cache, m walks the new entries stored in pal itself. The
// write index i equals n + (m - n_used_cache) and so never passes m, which
// keeps the in-place merge from overwriting unread new entries.
int n = 0 , m = n_used_cache;
for (i = 0 ; i < pal_sz; i++) {
if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
pal[i] = used_cache[n++];
} else {
assert(m < pal_sz);
pal[i] = pal[m++];
}
}
} else {
// every entry was taken from the cache: copy the selected entries as-is
memcpy(pal, used_cache, n_used_cache * sizeof (*used_cache));
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=" ,
pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
for (int n = 0 ; n < n_cache; n++)
printf("%c%02x" , n ? ' ' : '[' , cache[n]);
printf("%s, pal=" , n_cache ? "]" : "[]" );
for (int n = 0 ; n < pal_sz; n++)
printf("%c%02x" , n ? ' ' : '[' , pal[n]);
printf("]\n" );
}
}
/*
 * Decode both chroma palettes of block `b`.
 *
 * Plane 1 is decoded by dav1d_read_pal_plane(). Plane 2 is decoded here with
 * b->pal_sz[1] entries, in one of two modes selected by a single flag:
 *  - flag set:   the first entry is a raw bpc-bit value; each later entry is
 *                a magnitude (plus a sign flag when non-zero) added to the
 *                previous entry and wrapped with `& max`;
 *  - flag clear: every entry is a raw bpc-bit value.
 *
 * t:        task context (tile state, palette buffers).
 * b:        block being decoded.
 * sz_ctx:   context index, forwarded to dav1d_read_pal_plane().
 * bx4, by4: neighbour context indices, forwarded to dav1d_read_pal_plane().
 */
void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
                               const int sz_ctx, const int bx4, const int by4)
{
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);

    // V pal coding
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;

    pixel *pal;
    if (t->frame_thread.pass) {
        pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                  ((t->bx >> 1) + (t->by & 1))][2];
    } else {
        pal = bytefn(t->scratch.pal)[2];
    }

    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
    const int n_entries = b->pal_sz[1];

    if (!dav1d_msac_decode_bool_equi(&ts->msac)) {
        // Raw mode: every entry is coded independently.
        for (int idx = 0; idx < n_entries; idx++) {
            pal[idx] = dav1d_msac_decode_bools(&ts->msac, bpc);
        }
    } else {
        // Delta mode. The bit width is read before the first entry.
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
        const int max = (1 << bpc) - 1;

        pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
        int prev = pal[0];

        for (int idx = 1; idx < n_entries; idx++) {
            const int magnitude = dav1d_msac_decode_bools(&ts->msac, bits);
            // The sign flag is only present for non-zero magnitudes.
            const int negative =
                magnitude && dav1d_msac_decode_bool_equi(&ts->msac);
            const int delta = negative ? -magnitude : magnitude;

            pal[idx] = (prev + delta) & max;
            prev = pal[idx];
        }
    }

    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
        for (int idx = 0; idx < n_entries; idx++) {
            printf("%c%02x", idx ? ' ' : '[', pal[idx]);
        }
        printf("]\n");
    }
}
Messung V0.5 in Prozent C=89 H=76 G=82
¤ Dauer der Verarbeitung: 0.47 Sekunden
(vorverarbeitet am 2026-06-07)
¤
*© Formatika GbR, Deutschland