Quelle recon_tmpl.c

Sprache: C

/*
* Copyright © 2018-2021, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "config.h"

#include <string.h>
#include <stdio.h>

#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/dump.h"
#include "common/frame.h"
#include "common/intops.h"

#include "src/cdef_apply.h"
#include "src/ctx.h"
#include "src/ipred_prepare.h"
#include "src/lf_apply.h"
#include "src/lr_apply.h"
#include "src/recon.h"
#include "src/scan.h"
#include "src/tables.h"
#include "src/wedge.h"

static inline unsigned read_golomb(MsacContext *const msac) {
    int len = 0;
    unsigned val = 1;

    while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);

    return val - 1;
}

static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
                                    const enum BlockSize bs,
                                    const uint8_t *const a,
                                    const uint8_t *const l,
                                    const int chroma,
                                    const enum Dav1dPixelLayout layout)
{
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    if (chroma) {
        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
        unsigned ca, cl;

#define MERGE_CTX(dir, type, no_val) \
        c##dir = *(const type *) dir != no_val; \
        break

        switch (t_dim->lw) {
        /* For some reason the MSVC CRT _wassert() function is not flagged as
         * __declspec(noreturn), so when using those headers the compiler will
         * expect execution to continue after an assertion has been triggered
         * and will therefore complain about the use of uninitialized variables
         * when compiled in debug mode if we put the default case at the end. */
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
        }
#undef MERGE_CTX

        return 7 + not_one_blk * 3 + ca + cl;
    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
        return 0;
    } else {
        unsigned la, ll;

#define MERGE_CTX(dir, type, tx) \
        if (tx == TX_64X64) { \
            uint64_t tmp = *(const uint64_t *) dir; \
            tmp |= *(const uint64_t *) &dir[8]; \
            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
        } else \
            l##dir = *(const type *) dir; \
        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
        break

        switch (t_dim->lw) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
        }
#undef MERGE_CTX

        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
    }
}

static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
                                       const uint8_t *const a,
                                       const uint8_t *const l)
{
    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
    int s;

#if ARCH_X86_64 && defined(__GNUC__)
    /* Coerce compilers into producing better code. For some reason
     * every x86-64 compiler is awful at handling 64-bit constants. */
    __asm__("" : "+r"(mask), "+r"(mul));
#endif

    switch(tx) {
    default: assert(0); /* fall-through */
    case TX_4X4: {
        int t = *(const uint8_t *) a >> 6;
        t    += *(const uint8_t *) l >> 6;
        s = t - 1 - 1;
        break;
    }
    case TX_8X8: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 2;
        break;
    }
    case TX_16X16: {
        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
        t *= (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 4;
        break;
    }
    case TX_32X32: {
        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
        t         += (*(const uint64_t *) l & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 8;
        break;
    }
    case TX_64X64: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 16;
        break;
    }
    case RTX_4X8: {
        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 1 - 2;
        break;
    }
    case RTX_8X4: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint8_t  *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 1;
        break;
    }
    case RTX_8X16: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 2 - 4;
        break;
    }
    case RTX_16X8: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 2;
        break;
    }
    case RTX_16X32: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 4 - 8;
        break;
    }
    case RTX_32X16: {
        uint64_t t = *(const uint64_t *) a & mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 4;
        break;
    }
    case RTX_32X64: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 16;
        break;
    }
    case RTX_64X32: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 8;
        break;
    }
    case RTX_4X16: {
        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 1 - 4;
        break;
    }
    case RTX_16X4: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint8_t  *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 1;
        break;
    }
    case RTX_8X32: {
        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 2 - 8;
        break;
    }
    case RTX_32X8: {
        uint64_t t = *(const uint64_t *) a & mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 2;
        break;
    }
    case RTX_16X64: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) &l[0] & mask;
        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 4 - 16;
        break;
    }
    case RTX_64X16: {
        uint64_t t = *(const uint64_t *) &a[0] & mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 16 - 4;
        break;
    }
    }

    return (s != 0) + (s > 0);
}

static inline unsigned get_lo_ctx(const uint8_t *const levels,
                                  const enum TxClass tx_class,
                                  unsigned *const hi_mag,
                                  const uint8_t (*const ctx_offsets)[5],
                                  const unsigned x, const unsigned y,
                                  const ptrdiff_t stride)
{
    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
    unsigned offset;
    if (tx_class == TX_CLASS_2D) {
        mag += levels[1 * stride + 1];
        *hi_mag = mag;
        mag += levels[0 * stride + 2] + levels[2 * stride + 0];
        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
    } else {
        mag += levels[0 * stride + 2];
        *hi_mag = mag;
        mag += levels[0 * stride + 3] + levels[0 * stride + 4];
        offset = 26 + (y > 1 ? 10 : y * 5);
    }
    return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
}

static int decode_coefs(Dav1dTaskContext *const t,
                        uint8_t *const a, uint8_t *const l,
                        const enum RectTxfmSize tx, const enum BlockSize bs,
                        const Av1Block *const b, const int intra,
                        const int plane, coef *cf,
                        enum TxfmType *const txtp, uint8_t *res_ctx)
{
    Dav1dTileState *const ts = t->ts;
    const int chroma = !!plane;
    const Dav1dFrameContext *const f = t->f;
    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
    const int dbg = DEBUG_BLOCK_INFO && plane && 0;

    if (dbg)
        printf("Start: r=%d\n", ts->msac.rng);

    // does this block have any non-zero coefficients
    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
    if (dbg)
        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
               t_dim->ctx, sctx, all_skip, ts->msac.rng);
    if (all_skip) {
        *res_ctx = 0x40;
        *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
        return -1;
    }

    // transform type (chroma: derived, luma: explicitly coded)
    if (lossless) {
        assert(t_dim->max == TX_4X4);
        *txtp = WHT_WHT;
    } else if (t_dim->max + intra >= TX_64X64) {
        *txtp = DCT_DCT;
    } else if (chroma) {
        // inferred from either the luma txtp (inter) or a LUT (intra)
        *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
                        get_uv_inter_txtp(t_dim, *txtp);
    } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
        // In libaom, lossless is checked by a literal qidx == 0, but not all
        // such blocks are actually lossless. The remainder gets an implicit
        // transform type (for luma)
        *txtp = DCT_DCT;
    } else {
        unsigned idx;
        if (intra) {
            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
            if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                          ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
                *txtp = dav1d_tx_types_per_set[idx + 0];
            } else {
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                          ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
                *txtp = dav1d_tx_types_per_set[idx + 5];
            }
            if (dbg)
                printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
                       tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
        } else {
            if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
                idx = dav1d_msac_decode_bool_adapt(&ts->msac,
                          ts->cdf.m.txtp_inter3[t_dim->min]);
                *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
            } else if (t_dim->min == TX_16X16) {
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                          ts->cdf.m.txtp_inter2, 11);
                *txtp = dav1d_tx_types_per_set[idx + 12];
            } else {
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                          ts->cdf.m.txtp_inter1[t_dim->min], 15);
                *txtp = dav1d_tx_types_per_set[idx + 24];
            }
            if (dbg)
                printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
                       tx, t_dim->min, idx, *txtp, ts->msac.rng);
        }
    }

    // find end-of-block (eob)
    int eob_bin;
    const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
    const int tx2dszctx = slw + slh;
    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
    const int is_1d = tx_class != TX_CLASS_2D;
    switch (tx2dszctx) {
#define case_sz(sz, bin, ns, is_1d) \
    case sz: { \
        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
        eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
        break; \
    }
    case_sz(0,   16,  8, [is_1d]);
    case_sz(1,   32,  8, [is_1d]);
    case_sz(2,   64,  8, [is_1d]);
    case_sz(3,  128,  8, [is_1d]);
    case_sz(4,  256, 16, [is_1d]);
    case_sz(5,  512, 16,        );
    case_sz(6, 1024, 16,        );
#undef case_sz
    }
    if (dbg)
        printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
               16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
    int eob;
    if (eob_bin > 1) {
        uint16_t *const eob_hi_bit_cdf =
            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
        const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
        if (dbg)
            printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
                   t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
        eob = ((eob_hi_bit | 2) << (eob_bin - 2)) |
              dav1d_msac_decode_bools(&ts->msac, eob_bin - 2);
        if (dbg)
            printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
    } else {
        eob = eob_bin;
    }
    assert(eob >= 0);

    // base tokens
    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
    unsigned rc, dc_tok;

    if (eob) {
        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok

        /* eob */
        unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
        int tok = eob_tok + 1;
        int level_tok = tok * 0x41;
        unsigned mag;

#define DECODE_COEFS_CLASS(tx_class) \
        unsigned x, y; \
        uint8_t *level; \
        if (tx_class == TX_CLASS_2D) \
            rc = scan[eob], x = rc >> shift, y = rc & mask; \
        else if (tx_class == TX_CLASS_H) \
            /* Transposing reduces the stride and padding requirements */ \
            x = eob & mask, y = eob >> shift, rc = eob; \
        else /* tx_class == TX_CLASS_V */ \
            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
        if (dbg) \
            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
        if (eob_tok == 2) { \
            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
            level_tok = tok + (3 << 6); \
            if (dbg) \
                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
                       ts->msac.rng); \
        } \
        cf[rc] = tok << 11; \
        if (TX_CLASS_2D) \
            level = levels + rc; \
        else \
            level = levels + x * stride + y; \
        *level = (uint8_t) level_tok; \
        for (int i = eob - 1; i > 0; i--) { /* ac */ \
            unsigned rc_i; \
            if (tx_class == TX_CLASS_2D) \
                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
            else if (tx_class == TX_CLASS_H) \
                x = i & mask, y = i >> shift, rc_i = i; \
            else /* tx_class == TX_CLASS_V */ \
                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
            assert(x < 32 && y < 32); \
            if (TX_CLASS_2D) \
                level = levels + rc; \
            else \
                level = levels + x * stride + y; \
            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
            if (tx_class == TX_CLASS_2D) \
                y |= x; \
            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
            if (dbg) \
                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
            if (tok == 3) { \
                mag &= 63; \
                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
                      (mag > 12 ? 6 : (mag + 1) >> 1); \
                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
                if (dbg) \
                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
                           ts->msac.rng); \
                *level = (uint8_t) (tok + (3 << 6)); \
                cf[rc_i] = (tok << 11) | rc; \
                rc = rc_i; \
            } else { \
                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
                tok *= 0x17ff41; \
                *level = (uint8_t) tok; \
                /* tok ? (tok << 11) | rc : 0 */ \
                tok = (tok >> 9) & (rc + ~0x7ffu); \
                if (tok) rc = rc_i; \
                cf[rc_i] = tok; \
            } \
        } \
        /* dc */ \
        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
        if (dbg) \
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
        if (dc_tok == 3) { \
            if (tx_class == TX_CLASS_2D) \
                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
                      levels[1 * stride + 1]; \
            mag &= 63; \
            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
            if (dbg) \
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
        } \
        break

        const uint16_t *scan;
        switch (tx_class) {
        case TX_CLASS_2D: {
            const unsigned nonsquare_tx = tx >= RTX_4X8;
            const uint8_t (*const lo_ctx_offsets)[5] =
                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
            scan = dav1d_scans[tx];
            const ptrdiff_t stride = 4 << slh;
            const unsigned shift = slh + 2, shift2 = 0;
            const unsigned mask = (4 << slh) - 1;
            memset(levels, 0, stride * ((4 << slw) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_2D);
        }
        case TX_CLASS_H: {
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
            const ptrdiff_t stride = 16;
            const unsigned shift = slh + 2, shift2 = 0;
            const unsigned mask = (4 << slh) - 1;
            memset(levels, 0, stride * ((4 << slh) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_H);
        }
        case TX_CLASS_V: {
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
            const ptrdiff_t stride = 16;
            const unsigned shift = slw + 2, shift2 = slh + 2;
            const unsigned mask = (4 << slw) - 1;
            memset(levels, 0, stride * ((4 << slw) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_V);
        }
#undef DECODE_COEFS_CLASS
        default: assert(0);
        }
    } else { // dc-only
        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
        dc_tok = 1 + tok_br;
        if (dbg)
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
                   t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
        if (tok_br == 2) {
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
            if (dbg)
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
        }
        rc = 0;
    }

    // residual and sign
    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
    const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
    const int dq_shift = imax(0, t_dim->ctx - 2);
    const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
    unsigned cul_level, dc_sign_level;

    if (!dc_tok) {
        cul_level = 0;
        dc_sign_level = 1 << 6;
        if (qm_tbl) goto ac_qm;
        goto ac_noqm;
    }

    const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
    uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
    const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
    if (dbg)
        printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
               chroma, dc_sign_ctx, dc_sign, ts->msac.rng);

    int dc_dq = dq_tbl[0];
    dc_sign_level = (dc_sign - 1) & (2 << 6);

    if (qm_tbl) {
        dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;

        if (dc_tok == 15) {
            dc_tok = read_golomb(&ts->msac) + 15;
            if (dbg)
                printf("Post-dc_residual[%d->%d]: r=%d\n",
                       dc_tok - 15, dc_tok, ts->msac.rng);

            dc_tok &= 0xfffff;
            dc_dq = (dc_dq * dc_tok) & 0xffffff;
        } else {
            dc_dq *= dc_tok;
            assert(dc_dq <= 0xffffff);
        }
        cul_level = dc_tok;
        dc_dq >>= dq_shift;
        dc_dq = umin(dc_dq, cf_max + dc_sign);
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);

        if (rc) ac_qm: {
            const unsigned ac_dq = dq_tbl[1];
            do {
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
                if (dbg)
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                const unsigned rc_tok = cf[rc];
                unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
                int dq_sat;

                if (rc_tok >= (15 << 11)) {
                    tok = read_golomb(&ts->msac) + 15;
                    if (dbg)
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
                               rc, tok - 15, tok, ts->msac.rng);

                    tok &= 0xfffff;
                    dq = (dq * tok) & 0xffffff;
                } else {
                    tok = rc_tok >> 11;
                    dq *= tok;
                    assert(dq <= 0xffffff);
                }
                cul_level += tok;
                dq >>= dq_shift;
                dq_sat = umin(dq, cf_max + sign);
                cf[rc] = (coef) (sign ? -dq_sat : dq_sat);

                rc = rc_tok & 0x3ff;
            } while (rc);
        }
    } else {
        // non-qmatrix is the common case and allows for additional optimizations
        if (dc_tok == 15) {
            dc_tok = read_golomb(&ts->msac) + 15;
            if (dbg)
                printf("Post-dc_residual[%d->%d]: r=%d\n",
                       dc_tok - 15, dc_tok, ts->msac.rng);

            dc_tok &= 0xfffff;
            dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
            dc_dq = umin(dc_dq, cf_max + dc_sign);
        } else {
            dc_dq = ((dc_dq * dc_tok) >> dq_shift);
            assert(dc_dq <= cf_max);
        }
        cul_level = dc_tok;
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);

        if (rc) ac_noqm: {
            const unsigned ac_dq = dq_tbl[1];
            do {
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
                if (dbg)
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                const unsigned rc_tok = cf[rc];
                unsigned tok;
                int dq;

                // residual
                if (rc_tok >= (15 << 11)) {
                    tok = read_golomb(&ts->msac) + 15;
                    if (dbg)
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
                               rc, tok - 15, tok, ts->msac.rng);

                    // coefficient parsing, see 5.11.39
                    tok &= 0xfffff;

                    // dequant, see 7.12.3
                    dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
                    dq = umin(dq, cf_max + sign);
                } else {
                    // cannot exceed cf_max, so we can avoid the clipping
                    tok = rc_tok >> 11;
                    dq = ((ac_dq * tok) >> dq_shift);
                    assert(dq <= cf_max);
                }
                cul_level += tok;
                cf[rc] = (coef) (sign ? -dq : dq);

                rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
            } while (rc);
        }
    }

    // context
    *res_ctx = umin(cul_level, 63) | dc_sign_level;

    return eob;
}

static void read_coef_tree(Dav1dTaskContext *const t,
                           const enum BlockSize bs, const Av1Block *const b,
                           const enum RectTxfmSize ytx, const int depth,
                           const uint16_t *const tx_split,
                           const int x_off, const int y_off, pixel *dst)
{
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const Dav1dDSPContext *const dsp = f->dsp;
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
    const int txw = t_dim->w, txh = t_dim->h;

    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
     * be splitted. Aviods an undefined left shift. */
    if (depth < 2 && tx_split[depth] &&
        tx_split[depth] & (1 << (y_off * 4 + x_off)))
    {
        const enum RectTxfmSize sub = t_dim->sub;
        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;

        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                       x_off * 2 + 0, y_off * 2 + 0, dst);
        t->bx += txsw;
        if (txw >= txh && t->bx < f->bw)
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
        t->bx -= txsw;
        t->by += txsh;
        if (txh >= txw && t->by < f->bh) {
            if (dst)
                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                           x_off * 2 + 0, y_off * 2 + 1, dst);
            t->bx += txsw;
            if (txw >= txh && t->bx < f->bw)
                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
            t->bx -= txsw;
        }
        t->by -= txsh;
    } else {
        const int bx4 = t->bx & 31, by4 = t->by & 31;
        enum TxfmType txtp;
        uint8_t cf_ctx;
        int eob;
        coef *cf;

        if (t->frame_thread.pass) {
            const int p = t->frame_thread.pass & 1;
            assert(ts->frame_thread[p].cf);
            cf = ts->frame_thread[p].cf;
            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
        } else {
            cf = bitfn(t->cf);
        }
        if (t->frame_thread.pass != 2) {
            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
            if (DEBUG_BLOCK_INFO)
                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                       ytx, txtp, eob, ts->msac.rng);
            dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
            dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
#define set_ctx(rep_macro) \
            for (int y = 0; y < txh; y++) { \
                rep_macro(txtp_map, 0, txtp); \
                txtp_map += 32; \
            }
            uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
            case_set_upto16(t_dim->lw);
#undef set_ctx
            if (t->frame_thread.pass == 1)
                *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
        } else {
            const int cbi = *ts->frame_thread[0].cbi++;
            eob  = cbi >> 5;
            txtp = cbi & 0x1f;
        }
        if (!(t->frame_thread.pass & 1)) {
            assert(dst);
            if (eob >= 0) {
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
                                              HIGHBD_CALL_SUFFIX);
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
            }
        }
    }
}

void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
                                    const enum BlockSize bs, const Av1Block *const b)
{
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int bx4 = t->bx & 31, by4 = t->by & 31;
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                           (bw4 > ss_hor || t->bx & 1) &&
                           (bh4 > ss_ver || t->by & 1);

    if (b->skip) {
        BlockContext *const a = t->a;
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
        if (has_chroma) {
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
            memset_cw(&a->ccoef[0][cbx4], 0x40);
            memset_cw(&a->ccoef[1][cbx4], 0x40);
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
        }
        return;
    }

    Dav1dTileState *const ts = t->ts;
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
    assert(t->frame_thread.pass == 1);
    assert(!b->skip);
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };

    for (int init_y = 0; init_y < h4; init_y += 16) {
        const int sub_h4 = imin(h4, 16 + init_y);
        for (int init_x = 0; init_x < w4; init_x += 16) {
            const int sub_w4 = imin(w4, init_x + 16);
            int y_off = !!init_y, y, x;
            for (y = init_y, t->by += init_y; y < sub_h4;
                 y += t_dim->h, t->by += t_dim->h, y_off++)
            {
                int x_off = !!init_x;
                for (x = init_x, t->bx += init_x; x < sub_w4;
                     x += t_dim->w, t->bx += t_dim->w, x_off++)
                {
                    if (!b->intra) {
                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
                                       x_off, y_off, NULL);
                    } else {
                        uint8_t cf_ctx = 0x40;
                        enum TxfmType txtp;
                        const int eob =
                            decode_coefs(t, &t->a->lcoef[bx4 + x],
                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
                                         0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
                        if (DEBUG_BLOCK_INFO)
                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                   b->tx, txtp, eob, ts->msac.rng);
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                        ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
                    }
                }
                t->bx -= x;
            }
            t->by -= y;

            if (!has_chroma) continue;

            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
            for (int pl = 0; pl < 2; pl++) {
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
                {
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
                    {
                        uint8_t cf_ctx = 0x40;
                        enum TxfmType txtp;
                        if (!b->intra)
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
                                                        bx4 + (x << ss_hor)];
                        const int eob =
                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
                                         b, b->intra, 1 + pl, ts->frame_thread[1].cf,
                                         &txtp, &cf_ctx);
                        if (DEBUG_BLOCK_INFO)
                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                   "txtp=%d,eob=%d]: r=%d\n",
                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                        ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                    }
                    t->bx -= x << ss_hor;
                }
                t->by -= y << ss_ver;
            }
        }
    }
}

static int mc(Dav1dTaskContext *const t,
              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
              const int bw4, const int bh4,
              const int bx, const int by, const int pl,
              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
              const enum Filter2d filter_2d)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    const int mvx = mv.x, mvy = mv.y;
    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
    ptrdiff_t ref_stride = refp->p.stride[!!pl];
    const pixel *ref;

    if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
        int w, h;

        if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
            w = (f->cur.p.w + ss_hor) >> ss_hor;
            h = (f->cur.p.h + ss_ver) >> ss_ver;
        } else {
            w = f->bw * 4 >> ss_hor;
            h = f->bh * 4 >> ss_ver;
        }
        if (dx < !!mx * 3 || dy < !!my * 3 ||
            dx + bw4 * h_mul + !!mx * 4 > w ||
            dy + bh4 * v_mul + !!my * 4 > h)
        {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
                                w, h, dx - !!mx * 3, dy - !!my * 3,
                                emu_edge_buf, 192 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
            ref_stride = 192 * sizeof(pixel);
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver
                                     HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver
                                      HIGHBD_CALL_SUFFIX);
        }
    } else {
        assert(refp != &f->sr_cur);

        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
#define scale_mv(res, val, scale) do { \
            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
        } while (0)
        int pos_y, pos_x;
        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
#undef scale_mv
        const int left = pos_x >> 10;
        const int top = pos_y >> 10;
        const int right =
            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
        const int bottom =
            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;

        if (DEBUG_BLOCK_INFO)
            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
                   right-left, bottom-top,
                   f->svc[refidx][0].step, f->svc[refidx][1].step);

        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
                                w, h, left - 3, top - 3,
                                emu_edge_buf, 320 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[320 * 3 + 3];
            ref_stride = 320 * sizeof(pixel);
            if (DEBUG_BLOCK_INFO) printf("Emu\n");
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
                                            bw4 * h_mul, bh4 * v_mul,
                                            pos_x & 0x3ff, pos_y & 0x3ff,
                                            f->svc[refidx][0].step,
                                            f->svc[refidx][1].step
                                            HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
                                             bw4 * h_mul, bh4 * v_mul,
                                             pos_x & 0x3ff, pos_y & 0x3ff,
                                             f->svc[refidx][0].step,
                                             f->svc[refidx][1].step
                                             HIGHBD_CALL_SUFFIX);
        }
    }

    return 0;
}

static int obmc(Dav1dTaskContext *const t,
                pixel *const dst, const ptrdiff_t dst_stride,
                const uint8_t *const b_dim, const int pl,
                const int bx4, const int by4, const int w4, const int h4)
{
    assert(!(t->bx & 1) && !(t->by & 1));
    const Dav1dFrameContext *const f = t->f;
    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
    pixel *const lap = bitfn(t->scratch.lap);
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    int res;

    if (t->by > t->ts->tiling.row_start &&
        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
    {
        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
            const int step4 = iclip(a_b_dim[0], 2, 16);

            if (a_r->ref.ref[0] > 0) {
                const int ow4 = imin(step4, b_dim[0]);
                const int oh4 = imin(b_dim[1], 16) >> 1;
                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
                         t->bx + x, t->by, pl, a_r->mv.mv[0],
                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
                if (res) return res;
                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
                                   h_mul * ow4, v_mul * oh4);
                i++;
            }
            x += step4;
        }
    }

    if (t->bx > t->ts->tiling.col_start)
        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
            const int step4 = iclip(l_b_dim[1], 2, 16);

            if (l_r->ref.ref[0] > 0) {
                const int ow4 = imin(b_dim[0], 16) >> 1;
                const int oh4 = imin(step4, b_dim[1]);
                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
                         t->bx, t->by + y, pl, l_r->mv.mv[0],
                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
                if (res) return res;
                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
                i++;
            }
            y += step4;
        }
    return 0;
}

static int warp_affine(Dav1dTaskContext *const t,
                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
                       const uint8_t *const b_dim, const int pl,
                       const Dav1dThreadPicture *const refp,
                       const Dav1dWarpedMotionParams *const wmp)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
    const int32_t *const mat = wmp->matrix;
    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
    const int height = (refp->p.p.h + ss_ver) >> ss_ver;

    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
            // calculate transformation relative to center of 8x8 block in
            // luma pixel units
            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;

            const int dx = (int) (mvx >> 16) - 4;
            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
                                                   wmp->u.p.beta  * 7) & ~0x3f;
            const int dy = (int) (mvy >> 16) - 4;
            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
                                                   wmp->u.p.delta * 4) & ~0x3f;

            const pixel *ref_ptr;
            ptrdiff_t ref_stride = refp->p.stride[!!pl];

            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
                                    emu_edge_buf, 32 * sizeof(pixel),
                                    refp->p.data[pl], ref_stride);
                ref_ptr = &emu_edge_buf[32 * 3 + 3];
                ref_stride = 32 * sizeof(pixel);
            } else {
                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
            }
            if (dst16 != NULL)
                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            else
                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
        }
        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
        else      dst16 += 8 * dstride;
    }
    return 0;
}

void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
                                 const enum EdgeFlags intra_edge_flags,
                                 const Av1Block *const b)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int bx4 = t->bx & 31, by4 = t->by & 31;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                           (bw4 > ss_hor || t->bx & 1) &&
                           (bh4 > ss_ver || t->by & 1);
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];

    // coefficient coding
    pixel *const edge = bitfn(t->scratch.edge) + 128;
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;

    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;

    for (int init_y = 0; init_y < h4; init_y += 16) {
        const int sub_h4 = imin(h4, 16 + init_y);
        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
        for (int init_x = 0; init_x < w4; init_x += 16) {
            if (b->pal_sz[0]) {
                pixel *dst = ((pixel *) f->cur.data[0]) +
                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
                const uint8_t *pal_idx;
                if (t->frame_thread.pass) {
                    const int p = t->frame_thread.pass & 1;
                    assert(ts->frame_thread[p].pal_idx);
                    pal_idx = ts->frame_thread[p].pal_idx;
                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
                } else {
                    pal_idx = t->scratch.pal_idx_y;
                }
                const pixel *const pal = t->frame_thread.pass ?
                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                        ((t->bx >> 1) + (t->by & 1))][0] :
                    bytefn(t->scratch.pal)[0];
                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
                                       pal_idx, bw4 * 4, bh4 * 4);
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
                             bw4 * 4, bh4 * 4, "y-pal-pred");
            }

            const int intra_flags = (sm_flag(t->a, bx4) |
                                     sm_flag(&t->l, by4) |
                                     intra_edge_filter_flag);
            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
            int y, x;
            const int sub_w4 = imin(w4, init_x + 16);
            for (y = init_y, t->by += init_y; y < sub_h4;
                 y += t_dim->h, t->by += t_dim->h)
            {
                pixel *dst = ((pixel *) f->cur.data[0]) +
                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
                                    t->bx + init_x);
                for (x = init_x, t->bx += init_x; x < sub_w4;
                     x += t_dim->w, t->bx += t_dim->w)
                {
                    if (b->pal_sz[0]) goto skip_y_pred;

                    int angle = b->y_angle;
                    const enum EdgeFlags edge_flags =
                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
                             0 : EDGE_I444_TOP_HAS_RIGHT) |
                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
                    const pixel *top_sb_edge = NULL;
                    if (!(t->by & (f->sb_step - 1))) {
                        top_sb_edge = f->ipred_edge[0];
                        const int sby = t->by >> f->sb_shift;
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
                    }
                    const enum IntraPredMode m =
                        bytefn(dav1d_prepare_intra_edges)(t->bx,
                                                          t->bx > ts->tiling.col_start,
                                                          t->by,
                                                          t->by > ts->tiling.row_start,
                                                          ts->tiling.col_end,
                                                          ts->tiling.row_end,
                                                          edge_flags, dst,
                                                          f->cur.stride[0], top_sb_edge,
                                                          b->y_mode, &angle,
                                                          t_dim->w, t_dim->h,
                                                          f->seq_hdr->intra_edge_filter,
                                                          edge HIGHBD_CALL_SUFFIX);
                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
                                             t_dim->w * 4, t_dim->h * 4,
                                             angle | intra_flags,
                                             4 * f->bw - 4 * t->bx,
                                             4 * f->bh - 4 * t->by
                                             HIGHBD_CALL_SUFFIX);

                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
                                 t_dim->h * 4, 2, "l");
                        hex_dump(edge, 0, 1, 1, "tl");
                        hex_dump(edge + 1, t_dim->w * 4,
                                 t_dim->w * 4, 2, "t");
                        hex_dump(dst, f->cur.stride[0],
                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
                    }

                skip_y_pred: {}
                    if (!b->skip) {
                        coef *cf;
                        int eob;
                        enum TxfmType txtp;
                        if (t->frame_thread.pass) {
                            const int p = t->frame_thread.pass & 1;
                            const int cbi = *ts->frame_thread[p].cbi++;
                            cf = ts->frame_thread[p].cf;
                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
                            eob  = cbi >> 5;
                            txtp = cbi & 0x1f;
                        } else {
                            uint8_t cf_ctx;
                            cf = bitfn(t->cf);
                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
                                               &t->l.lcoef[by4 + y], b->tx, bs,
                                               b, 1, 0, cf, &txtp, &cf_ctx);
                            if (DEBUG_BLOCK_INFO)
                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                       b->tx, txtp, eob, ts->msac.rng);
                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
                        }
                        if (eob >= 0) {
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                coef_dump(cf, imin(t_dim->h, 8) * 4,
                                          imin(t_dim->w, 8) * 4, 3, "dq");
                            dsp->itx.itxfm_add[b->tx]
                                              [txtp](dst,
                                                     f->cur.stride[0],
                                                     cf, eob HIGHBD_CALL_SUFFIX);
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                hex_dump(dst, f->cur.stride[0],
                                         t_dim->w * 4, t_dim->h * 4, "recon");
                        }
                    } else if (!t->frame_thread.pass) {
                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
                    }
                    dst += 4 * t_dim->w;
                }
                t->bx -= x;
            }
            t->by -= y;

            if (!has_chroma) continue;

            const ptrdiff_t stride = f->cur.stride[1];

            if (b->uv_mode == CFL_PRED) {
                assert(!init_x && !init_y);

                int16_t *const ac = t->scratch.ac;
                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
                                              (t->by >> ss_ver) * PXSTRIDE(stride));
                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
                                           ((pixel *) f->cur.data[2]) + uv_off };

                const int furthest_r =
                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
                const int furthest_b =
                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
                                                         cbw4 - (furthest_r >> ss_hor),
                                                         cbh4 - (furthest_b >> ss_ver),
                                                         cbw4 * 4, cbh4 * 4);
                for (int pl = 0; pl < 2; pl++) {
                    if (!b->cfl_alpha[pl]) continue;
                    int angle = 0;
                    const pixel *top_sb_edge = NULL;
                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
                        top_sb_edge = f->ipred_edge[pl + 1];
                        const int sby = t->by >> f->sb_shift;
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
                    }
                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
                    const int xstart = ts->tiling.col_start >> ss_hor;
                    const int ystart = ts->tiling.row_start >> ss_ver;
                    const enum IntraPredMode m =
                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
                                                          ypos, ypos > ystart,
                                                          ts->tiling.col_end >> ss_hor,
                                                          ts->tiling.row_end >> ss_ver,
                                                          0, uv_dst[pl], stride,
                                                          top_sb_edge, DC_PRED, &angle,
                                                          uv_t_dim->w, uv_t_dim->h, 0,
                                                          edge HIGHBD_CALL_SUFFIX);
                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
                                           uv_t_dim->w * 4,
                                           uv_t_dim->h * 4,
                                           ac, b->cfl_alpha[pl]
                                           HIGHBD_CALL_SUFFIX);
                }
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
                }
            } else if (b->pal_sz[1]) {
                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
                const pixel (*pal)[8];
                const uint8_t *pal_idx;
                if (t->frame_thread.pass) {
                    const int p = t->frame_thread.pass & 1;
                    assert(ts->frame_thread[p].pal_idx);
                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                              ((t->bx >> 1) + (t->by & 1))];
                    pal_idx = ts->frame_thread[p].pal_idx;
                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
                } else {
                    pal = bytefn(t->scratch.pal);
                    pal_idx = t->scratch.pal_idx_uv;
                }

                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
                                       f->cur.stride[1], pal[1],
                                       pal_idx, cbw4 * 4, cbh4 * 4);
                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
                                       f->cur.stride[1], pal[2],
                                       pal_idx, cbw4 * 4, cbh4 * 4);
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
                             PXSTRIDE(f->cur.stride[1]),
                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
                             PXSTRIDE(f->cur.stride[1]),
                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
                }
            }

            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
                                 sm_uv_flag(&t->l, cby4);
            const int uv_sb_has_tr =
                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
            const int uv_sb_has_bl =
                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
            for (int pl = 0; pl < 2; pl++) {
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
                {
                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
                                        ((t->bx + init_x) >> ss_hor));
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
                    {
                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
                            b->pal_sz[1])
                        {
                            goto skip_uv_pred;
                        }

                        int angle = b->uv_angle;
                        // this probably looks weird because we're using
                        // luma flags in a chroma loop, but that's because
                        // prepare_intra_edges() expects luma flags as input
                        const enum EdgeFlags edge_flags =
                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
                              (x + uv_t_dim->w >= sub_cw4)) ?
                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
                            ((x > (init_x >> ss_hor) ||
                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
                        const pixel *top_sb_edge = NULL;
                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
                            top_sb_edge = f->ipred_edge[1 + pl];
                            const int sby = t->by >> f->sb_shift;
                            top_sb_edge += f->sb128w * 128 * (sby - 1);
                        }
                        const enum IntraPredMode uv_mode =
                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
                        const int xstart = ts->tiling.col_start >> ss_hor;
                        const int ystart = ts->tiling.row_start >> ss_ver;
                        const enum IntraPredMode m =
                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
                                                              ypos, ypos > ystart,
                                                              ts->tiling.col_end >> ss_hor,
                                                              ts->tiling.row_end >> ss_ver,
                                                              edge_flags, dst, stride,
                                                              top_sb_edge, uv_mode,
                                                              &angle, uv_t_dim->w,
                                                              uv_t_dim->h,
                                                              f->seq_hdr->intra_edge_filter,
                                                              edge HIGHBD_CALL_SUFFIX);
                        angle |= intra_edge_filter_flag;
                        dsp->ipred.intra_pred[m](dst, stride, edge,
                                                 uv_t_dim->w * 4,
                                                 uv_t_dim->h * 4,
                                                 angle | sm_uv_fl,
                                                 (4 * f->bw + ss_hor -
                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
                                                 (4 * f->bh + ss_ver -
                                                  4 * (t->by & ~ss_ver)) >> ss_ver
                                                 HIGHBD_CALL_SUFFIX);
                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
                                     uv_t_dim->h * 4, 2, "l");
                            hex_dump(edge, 0, 1, 1, "tl");
                            hex_dump(edge + 1, uv_t_dim->w * 4,
                                     uv_t_dim->w * 4, 2, "t");
                            hex_dump(dst, stride, uv_t_dim->w * 4,
                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
                        }

                    skip_uv_pred: {}
                        if (!b->skip) {
                            enum TxfmType txtp;
                            int eob;
                            coef *cf;
                            if (t->frame_thread.pass) {
                                const int p = t->frame_thread.pass & 1;
                                const int cbi = *ts->frame_thread[p].cbi++;
                                cf = ts->frame_thread[p].cf;
                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
                                eob  = cbi >> 5;
                                txtp = cbi & 0x1f;
                            } else {
                                uint8_t cf_ctx;
                                cf = bitfn(t->cf);
                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                                   &t->l.ccoef[pl][cby4 + y],
                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
                                                   &txtp, &cf_ctx);
                                if (DEBUG_BLOCK_INFO)
                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                            }
                            if (eob >= 0) {
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                    coef_dump(cf, uv_t_dim->h * 4,
                                              uv_t_dim->w * 4, 3, "dq");
                                dsp->itx.itxfm_add[b->uvtx]
                                                  [txtp](dst, stride,
                                                         cf, eob HIGHBD_CALL_SUFFIX);
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                    hex_dump(dst, stride, uv_t_dim->w * 4,
                                             uv_t_dim->h * 4, "recon");
                            }
                        } else if (!t->frame_thread.pass) {
                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
                        }
                        dst += uv_t_dim->w * 4;
                    }
                    t->bx -= x << ss_hor;
                }
                t->by -= y << ss_ver;
            }
        }
    }
}

int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
                                const Av1Block *const b)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int bx4 = t->bx & 31, by4 = t->by & 31;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                           (bw4 > ss_hor || t->bx & 1) &&
                           (bh4 > ss_ver || t->by & 1);
    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
    int res;

    // prediction
    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
    pixel *dst = ((pixel *) f->cur.data[0]) +
        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
    const ptrdiff_t uvdstoff =
        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
    if (IS_KEY_OR_INTRA(f->frame_hdr)) {
        // intrabc
        assert(!f->frame_hdr->super_res.enabled);
        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
        if (res) return res;
        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
            if (res) return res;
        }
    } else if (b->comp_type == COMP_INTER_NONE) {
        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
        const enum Filter2d filter_2d = b->filter2d;

        if (imin(bw4, bh4) > 1 &&
            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
        {
            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
                              b->motion_mode == MM_WARP ? &t->warpmv :
                                  &f->frame_hdr->gmv[b->ref[0]]);
            if (res) return res;
        } else {
            res = mc(t, dst, NULL, f->cur.stride[0],
                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
            if (res) return res;
            if (b->motion_mode == MM_OBMC) {
                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
                if (res) return res;
            }
        }
        if (b->interintra_type) {
            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
                                   SMOOTH_PRED : b->interintra_mode;
            pixel *const tmp = bitfn(t->scratch.interintra);
            int angle = 0;
            const pixel *top_sb_edge = NULL;
            if (!(t->by & (f->sb_step - 1))) {
                top_sb_edge = f->ipred_edge[0];
                const int sby = t->by >> f->sb_shift;
                top_sb_edge += f->sb128w * 128 * (sby - 1);
            }
            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
                                                  t->by, t->by > ts->tiling.row_start,
                                                  ts->tiling.col_end, ts->tiling.row_end,
                                                  0, dst, f->cur.stride[0], top_sb_edge,
                                                  m, &angle, bw4, bh4, 0, tl_edge
                                                  HIGHBD_CALL_SUFFIX);
            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
                                     HIGHBD_CALL_SUFFIX);
            dsp->mc.blend(dst, f->cur.stride[0], tmp,
                          bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
        }

        if (!has_chroma) goto skip_inter_chroma_pred;

        // sub8x8 derivation
        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
        refmvs_block *const *r;
        if (is_sub8x8) {
            assert(ss_hor == 1);
            r = &t->rt.r[(t->by & 31) + 5];
            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
            if (bw4 == 1 && bh4 == ss_ver)
                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
        }

        // chroma prediction
        if (is_sub8x8) {
            assert(ss_hor == 1);
            ptrdiff_t h_off = 0, v_off = 0;
            if (bw4 == 1 && bh4 == ss_ver) {
                for (int pl = 0; pl < 2; pl++) {
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
                             NULL, f->cur.stride[1],
                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
                             r[-1][t->bx - 1].mv.mv[0],
                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
                             r[-1][t->bx - 1].ref.ref[0] - 1,
                             t->frame_thread.pass != 2 ? t->tl_4x4_filter :
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
                    if (res) return res;
                }
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
                h_off = 2;
            }
            if (bw4 == 1) {
                const enum Filter2d left_filter_2d =
                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
                for (int pl = 0; pl < 2; pl++) {
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
                             f->cur.stride[1], bw4, bh4, t->bx - 1,
                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
                             r[0][t->bx - 1].ref.ref[0] - 1,
                             t->frame_thread.pass != 2 ? left_filter_2d :
                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
                    if (res) return res;
                }
                h_off = 2;
            }
            if (bh4 == ss_ver) {
                const enum Filter2d top_filter_2d =
                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
                for (int pl = 0; pl < 2; pl++) {
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
                             1 + pl, r[-1][t->bx].mv.mv[0],
                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
                             r[-1][t->bx].ref.ref[0] - 1,
                             t->frame_thread.pass != 2 ? top_filter_2d :
                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
                    if (res) return res;
                }
                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
            }
            for (int pl = 0; pl < 2; pl++) {
                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
                         refp, b->ref[0], filter_2d);
                if (res) return res;
            }
        } else {
            if (imin(cbw4, cbh4) > 1 &&
                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
            {
                for (int pl = 0; pl < 2; pl++) {
                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
                                      f->cur.stride[1], b_dim, 1 + pl, refp,
                                      b->motion_mode == MM_WARP ? &t->warpmv :
                                          &f->frame_hdr->gmv[b->ref[0]]);
                    if (res) return res;
                }
            } else {
                for (int pl = 0; pl < 2; pl++) {
                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
                             NULL, f->cur.stride[1],
                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
                             t->bx & ~ss_hor, t->by & ~ss_ver,
                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
                    if (res) return res;
                    if (b->motion_mode == MM_OBMC) {
                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
                        if (res) return res;
                    }
                }
            }
            if (b->interintra_type) {
                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
                // the wrong thing since it will select 4x16, not 4x32, as a
                // transform size...
                const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);

                for (int pl = 0; pl < 2; pl++) {
                    pixel *const tmp = bitfn(t->scratch.interintra);
                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
                    enum IntraPredMode m =
                        b->interintra_mode == II_SMOOTH_PRED ?
                        SMOOTH_PRED : b->interintra_mode;
                    int angle = 0;
                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
                    const pixel *top_sb_edge = NULL;
                    if (!(t->by & (f->sb_step - 1))) {
                        top_sb_edge = f->ipred_edge[pl + 1];
                        const int sby = t->by >> f->sb_shift;
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
                    }
                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
                                                          (t->bx >> ss_hor) >
                                                              (ts->tiling.col_start >> ss_hor),
                                                          t->by >> ss_ver,
                                                          (t->by >> ss_ver) >
                                                              (ts->tiling.row_start >> ss_ver),
                                                          ts->tiling.col_end >> ss_hor,
                                                          ts->tiling.row_end >> ss_ver,
                                                          0, uvdst, f->cur.stride[1],
                                                          top_sb_edge, m,
                                                          &angle, cbw4, cbh4, 0, tl_edge
                                                          HIGHBD_CALL_SUFFIX);
                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
                                             HIGHBD_CALL_SUFFIX);
                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
                                  cbw4 * 4, cbh4 * 4, ii_mask);
                }
            }
        }

    skip_inter_chroma_pred: {}
        t->tl_4x4_filter = filter_2d;
    } else {
        const enum Filter2d filter_2d = b->filter2d;
        // Maximum super block size is 128x128
        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
        int jnt_weight;
        uint8_t *const seg_mask = t->scratch.seg_mask;
        const uint8_t *mask;

        for (int i = 0; i < 2; i++) {
            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];

            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
                                  &f->frame_hdr->gmv[b->ref[i]]);
                if (res) return res;
            } else {
                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
                         b->mv[i], refp, b->ref[i], filter_2d);
                if (res) return res;
            }
        }
        switch (b->comp_type) {
        case COMP_INTER_AVG:
            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
            break;
        case COMP_INTER_WEIGHTED_AVG:
            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
            break;
        case COMP_INTER_SEG:
            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
                                           tmp[b->mask_sign], tmp[!b->mask_sign],
                                           bw4 * 4, bh4 * 4, seg_mask,
                                           b->mask_sign HIGHBD_CALL_SUFFIX);
            mask = seg_mask;
            break;
        case COMP_INTER_WEDGE:
            mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
            dsp->mc.mask(dst, f->cur.stride[0],
                         tmp[b->mask_sign], tmp[!b->mask_sign],
                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
            if (has_chroma)
                mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
            break;
        }

        // chroma
        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
            for (int i = 0; i < 2; i++) {
                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
                if (b->inter_mode == GLOBALMV_GLOBALMV &&
                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
                {
                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
                                      b_dim, 1 + pl,
                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
                    if (res) return res;
                } else {
                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
                    if (res) return res;
                }
            }
            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
            switch (b->comp_type) {
            case COMP_INTER_AVG:
                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
                            HIGHBD_CALL_SUFFIX);
                break;
            case COMP_INTER_WEIGHTED_AVG:
                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
                              HIGHBD_CALL_SUFFIX);
                break;
            case COMP_INTER_WEDGE:
            case COMP_INTER_SEG:
                dsp->mc.mask(uvdst, f->cur.stride[1],
                             tmp[b->mask_sign], tmp[!b->mask_sign],
                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
                             HIGHBD_CALL_SUFFIX);
                break;
            }
        }
    }

    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
        if (has_chroma) {
            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
                     cbw4 * 4, cbh4 * 4, "u-pred");
            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
                     cbw4 * 4, cbh4 * 4, "v-pred");
        }
    }

    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;

    if (b->skip) {
        // reset coef contexts
        BlockContext *const a = t->a;
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
        if (has_chroma) {
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
            memset_cw(&a->ccoef[0][cbx4], 0x40);
            memset_cw(&a->ccoef[1][cbx4], 0x40);
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
        }
        return 0;
    }

    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };

    for (int init_y = 0; init_y < bh4; init_y += 16) {
        for (int init_x = 0; init_x < bw4; init_x += 16) {
            // coefficient coding & inverse transforms
            int y_off = !!init_y, y;
            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
                 y += ytx->h, y_off++)
            {
                int x, x_off = !!init_x;
                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
                     x += ytx->w, x_off++)
                {
                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
                                   x_off, y_off, &dst[x * 4]);
                    t->bx += ytx->w;
                }
                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
                t->bx -= x;
                t->by += ytx->h;
            }
            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
            t->by -= y;

            // chroma coefs and inverse transform
            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
                for (y = init_y >> ss_ver, t->by += init_y;
                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
                {
                    int x;
                    for (x = init_x >> ss_hor, t->bx += init_x;
                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
                    {
                        coef *cf;
                        int eob;
                        enum TxfmType txtp;
                        if (t->frame_thread.pass) {
                            const int p = t->frame_thread.pass & 1;
                            const int cbi = *ts->frame_thread[p].cbi++;
                            cf = ts->frame_thread[p].cf;
                            ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
                            eob  = cbi >> 5;
                            txtp = cbi & 0x1f;
                        } else {
                            uint8_t cf_ctx;
                            cf = bitfn(t->cf);
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
                                                        bx4 + (x << ss_hor)];
                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                               &t->l.ccoef[pl][cby4 + y],
                                               b->uvtx, bs, b, 0, 1 + pl,
                                               cf, &txtp, &cf_ctx);
                            if (DEBUG_BLOCK_INFO)
                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                       "txtp=%d,eob=%d]: r=%d\n",
                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                        }
                        if (eob >= 0) {
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
                            dsp->itx.itxfm_add[b->uvtx]
                                              [txtp](&uvdst[4 * x],
                                                     f->cur.stride[1],
                                                     cf, eob HIGHBD_CALL_SUFFIX);
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
                                         uvtx->w * 4, uvtx->h * 4, "recon");
                        }
                        t->bx += uvtx->w << ss_hor;
                    }
                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
                    t->bx -= x << ss_hor;
                    t->by += uvtx->h << ss_ver;
                }
                t->by -= y << ss_ver;
            }
        }
    }
    return 0;
}

void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
    {
        return;
    }
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
                                        f->lf.start_of_tile_row[sby]);
}

void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
    {
        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
    }
    if (f->seq_hdr->cdef || f->lf.restore_planes) {
        // Store loop filtered pixels required by CDEF / LR
        bytefn(dav1d_copy_lpf)(f, p, sby);
    }
}

void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
    const Dav1dFrameContext *const f = tc->f;
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    const int start = sby * sbsz;
    if (sby) {
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        pixel *p_up[3] = {
            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        };
        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
    }
    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
    const int end = imin(start + n_blks, f->bh);
    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
}

void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    pixel *const sr_p[3] = {
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
    };
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        const int h_start = 8 * !!sby >> ss_ver;
        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
        const ptrdiff_t src_stride = f->cur.stride[!!pl];
        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;

        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
                          imin(img_h, h_end) + h_start, src_w,
                          f->resize_step[!!pl], f->resize_start[!!pl]
                          HIGHBD_CALL_SUFFIX);
    }
}

void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const sr_p[3] = {
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
    };
    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
}

void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
    if (f->seq_hdr->cdef)
        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
    if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
        bytefn(dav1d_filter_sbrow_resize)(f, sby);
    if (f->lf.restore_planes)
        bytefn(dav1d_filter_sbrow_lr)(f, sby);
}

void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const int sby = t->by >> f->sb_shift;
    const int sby_off = f->sb128w * 128 * sby;
    const int x_off = ts->tiling.col_start;

    const pixel *const y =
        ((const pixel *) f->cur.data[0]) + x_off * 4 +
                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
               4 * (ts->tiling.col_end - x_off));

    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;

        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
        for (int pl = 1; pl <= 2; pl++)
            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
                       &((const pixel *) f->cur.data[pl])[uv_off],
                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
    }
}

void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
                                    const int bx4, const int by4,
                                    const int bw4, const int bh4)

{
    const Dav1dFrameContext *const f = t->f;
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][0] :
        bytefn(t->scratch.pal)[0];
    for (int x = 0; x < bw4; x++)
        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
    for (int y = 0; y < bh4; y++)
        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
}

void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
                                     const int bx4, const int by4,
                                     const int bw4, const int bh4)

{
    const Dav1dFrameContext *const f = t->f;
    const pixel (*const pal)[8] = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))] :
        bytefn(t->scratch.pal);
    // see aomedia bug 2183 for why we use luma coordinates here
    for (int pl = 1; pl <= 2; pl++) {
        for (int x = 0; x < bw4; x++)
            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
        for (int y = 0; y < bh4; y++)
            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
    }
}

void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
                                  const int pl, const int sz_ctx,
                                  const int bx4, const int by4)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
    pixel cache[16], used_cache[8];
    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
    int n_cache = 0;
    // don't reuse above palette outside SB64 boundaries
    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];

    // fill/sort cache
    while (l_cache && a_cache) {
        if (*l < *a) {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
            l_cache--;
        } else {
            if (*a == *l) {
                l++;
                l_cache--;
            }
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
            a_cache--;
        }
    }
    if (l_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
        } while (--l_cache > 0);
    } else if (a_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
        } while (--a_cache > 0);
    }

    // find reused cache entries
    int i = 0;
    for (int n = 0; n < n_cache && i < pal_sz; n++)
        if (dav1d_msac_decode_bool_equi(&ts->msac))
            used_cache[i++] = cache[n];
    const int n_used_cache = i;

    // parse new entries
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][pl] :
        bytefn(t->scratch.pal)[pl];
    if (i < pal_sz) {
        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);

        if (i < pal_sz) {
            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
            const int max = (1 << bpc) - 1;

            do {
                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
                prev = pal[i++] = imin(prev + delta + !pl, max);
                if (prev + !pl >= max) {
                    for (; i < pal_sz; i++)
                        pal[i] = max;
                    break;
                }
                bits = imin(bits, 1 + ulog2(max - prev - !pl));
            } while (i < pal_sz);
        }

        // merge cache+new entries
        int n = 0, m = n_used_cache;
        for (i = 0; i < pal_sz; i++) {
            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
                pal[i] = used_cache[n++];
            } else {
                assert(m < pal_sz);
                pal[i] = pal[m++];
            }
        }
    } else {
        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
    }

    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
        for (int n = 0; n < n_cache; n++)
            printf("%c%02x", n ? ' ' : '[', cache[n]);
        printf("%s, pal=", n_cache ? "]" : "[]");
        for (int n = 0; n < pal_sz; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}

void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
                               const int sz_ctx, const int bx4, const int by4)
{
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);

    // V pal coding
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][2] :
        bytefn(t->scratch.pal)[2];
    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
        const int max = (1 << bpc) - 1;
        for (int i = 1; i < b->pal_sz[1]; i++) {
            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
            prev = pal[i] = (prev + delta) & max;
        }
    } else {
        for (int i = 0; i < b->pal_sz[1]; i++)
            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
    }
    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
        for (int n = 0; n < b->pal_sz[1]; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.47 Sekunden (vorverarbeitet am 2026-06-07) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.