Quellcodebibliothek Statistik Leitseite products/Sources/formale Sprachen/C/Firefox/media/libyuv/libyuv/include/libyuv/   (Browser von der Mozilla Stiftung Version 136.0.1©)  Datei vom 10.2.2025 mit Größe 86 kB image not shown  

Quelle  row_sve.h   Sprache: C

 
/*
 *  Copyright 2024 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#ifndef INCLUDE_LIBYUV_ROW_SVE_H_
#define INCLUDE_LIBYUV_ROW_SVE_H_

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)

#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
    defined(__aarch64__)
#define STREAMING_COMPATIBLE __arm_streaming_compatible
#else  // defined(LIBYUV_DISABLE_SME) || !defined(CLANG_HAS_SME) ||
       // !defined(__aarch64__)
#define STREAMING_COMPATIBLE
#endif  // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
        // defined(__aarch64__)

#define YUVTORGB_SVE_SETUP                          \
  "ld1rb {z28.b}, p0/z, [%[kUVCoeff], #0] \n" \
  "ld1rb {z29.b}, p0/z, [%[kUVCoeff], #1] \n" \
  "ld1rb {z30.b}, p0/z, [%[kUVCoeff], #2] \n" \
  "ld1rb {z31.b}, p0/z, [%[kUVCoeff], #3] \n" \
  "ld1rh {z24.h}, p0/z, [%[kRGBCoeffBias], #0] \n" \
  "ld1rh {z25.h}, p0/z, [%[kRGBCoeffBias], #2] \n" \
  "ld1rh {z26.h}, p0/z, [%[kRGBCoeffBias], #4] \n" \
  "ld1rh {z27.h}, p0/z, [%[kRGBCoeffBias], #6] \n"

#define READYUV444_SVE                           \
  "ld1b {z0.h}, p1/z, [%[src_y]] \n" \
  "ld1b {z1.h}, p1/z, [%[src_u]] \n" \
  "ld1b {z2.h}, p1/z, [%[src_v]] \n" \
  "add %[src_y], %[src_y], %[vl] \n" \
  "add %[src_u], %[src_u], %[vl] \n" \
  "add %[src_v], %[src_v], %[vl] \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 448] \n" \
  "trn1 z0.b, z0.b, z0.b \n" \
  "prfm pldl1keep, [%[src_v], 448] \n"

#define READYUV400_SVE                           \
  "ld1b {z0.h}, p1/z, [%[src_y]] \n" \
  "inch %[src_y] \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "trn1 z0.b, z0.b, z0.b \n"

#define READYUV422_SVE                           \
  "ld1b {z0.h}, p1/z, [%[src_y]] \n" \
  "ld1b {z1.s}, p1/z, [%[src_u]] \n" \
  "ld1b {z2.s}, p1/z, [%[src_v]] \n" \
  "inch %[src_y] \n" \
  "incw %[src_u] \n" \
  "incw %[src_v] \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n" \
  "trn1 z0.b, z0.b, z0.b \n" \
  "trn1 z1.h, z1.h, z1.h \n" \
  "trn1 z2.h, z2.h, z2.h \n"

// Read twice as much data from YUV, putting the even elements from the Y data
// in z0.h and odd elements in z1.h. U/V data is not duplicated, stored in
// z2.h/z3.h.
#define READYUV422_SVE_2X                        \
  "ld1b {z0.b}, p1/z, [%[src_y]] \n" \
  "ld1b {z2.h}, p1/z, [%[src_u]] \n" \
  "ld1b {z3.h}, p1/z, [%[src_v]] \n" \
  "incb %[src_y] \n" \
  "inch %[src_u] \n" \
  "inch %[src_v] \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n" \
  "trn2 z1.b, z0.b, z0.b \n" \
  "trn1 z0.b, z0.b, z0.b \n"

#define READI210_SVE                             \
  "ld1h {z3.h}, p1/z, [%[src_y]] \n" \
  "ld1h {z1.s}, p1/z, [%[src_u]] \n" \
  "ld1h {z2.s}, p1/z, [%[src_v]] \n" \
  "incb %[src_y] \n" \
  "inch %[src_u] \n" \
  "inch %[src_v] \n" \
  "lsl z0.h, z3.h, #6 \n" \
  "trn1 z1.h, z1.h, z1.h \n" \
  "trn1 z2.h, z2.h, z2.h \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n" \
  "usra z0.h, z3.h, #4 \n" \
  "uqshrnb z1.b, z1.h, #2 \n" \
  "uqshrnb z2.b, z2.h, #2 \n"

#define READP210_SVE                             \
  "ld1h {z0.h}, p1/z, [%[src_y]] \n" \
  "ld1h {z1.h}, p2/z, [%[src_uv]] \n" \
  "incb %[src_y] \n" \
  "incb %[src_uv] \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_uv], 256] \n" \
  "tbl z1.b, {z1.b}, z22.b \n"

#define READI410_SVE                             \
  "ld1h {z3.h}, p1/z, [%[src_y]] \n" \
  "lsl z0.h, z3.h, #6 \n" \
  "usra z0.h, z3.h, #4 \n" \
  "ld1h {z1.h}, p1/z, [%[src_u]] \n" \
  "ld1h {z2.h}, p1/z, [%[src_v]] \n" \
  "incb %[src_y] \n" \
  "incb %[src_u] \n" \
  "incb %[src_v] \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n" \
  "uqshrnb z1.b, z1.h, #2 \n" \
  "uqshrnb z2.b, z2.h, #2 \n"

// We need different predicates for the UV components since we are reading
// 32-bit (pairs of UV) elements rather than 16-bit Y elements.
#define READP410_SVE                                    \
  "ld1h {z0.h}, p1/z, [%[src_y]] \n" \
  "ld1w {z1.s}, p2/z, [%[src_uv]] \n" \
  "ld1w {z2.s}, p3/z, [%[src_uv], #1, mul vl] \n" \
  "incb %[src_y] \n" \
  "incb %[src_uv], all, mul #2 \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_uv], 256] \n" \
  "uzp2 z1.b, z1.b, z2.b \n"

#define READI212_SVE                             \
  "ld1h {z3.h}, p1/z, [%[src_y]] \n" \
  "ld1h {z1.s}, p1/z, [%[src_u]] \n" \
  "ld1h {z2.s}, p1/z, [%[src_v]] \n" \
  "incb %[src_y] \n" \
  "inch %[src_u] \n" \
  "inch %[src_v] \n" \
  "lsl z0.h, z3.h, #4 \n" \
  "trn1 z1.h, z1.h, z1.h \n" \
  "trn1 z2.h, z2.h, z2.h \n" \
  "prfm pldl1keep, [%[src_y], 448] \n" \
  "prfm pldl1keep, [%[src_u], 128] \n" \
  "prfm pldl1keep, [%[src_v], 128] \n" \
  "usra z0.h, z3.h, #8 \n" \
  "uqshrnb z1.b, z1.h, #4 \n" \
  "uqshrnb z2.b, z2.h, #4 \n"

#define I400TORGB_SVE                                    \
  "umulh z18.h, z24.h, z0.h \n" /* Y */ \
  "movprfx z16, z18 \n"         \
  "usqadd z16.h, p0/m, z16.h, z4.h \n" /* B */ \
  "movprfx z17, z18 \n"         \
  "usqadd z17.h, p0/m, z17.h, z6.h \n" /* G */ \
  "usqadd z18.h, p0/m, z18.h, z5.h \n" /* R */

// We need a different predicate for the UV component to handle the tail.
// If there is a single element remaining then we want to load one Y element
// but two UV elements.
#define READNV_SVE_2X                                               \
  "ld1b {z0.b}, p1/z, [%[src_y]] \n" /* Y0Y0 */         \
  "ld1b {z2.b}, p2/z, [%[src_uv]] \n" /* U0V0 or V0U0 */ \
  "incb %[src_y] \n"                    \
  "incb %[src_uv] \n"                    \
  "prfm pldl1keep, [%[src_y], 448] \n"                    \
  "prfm pldl1keep, [%[src_uv], 256] \n"                    \
  "trn2 z1.b, z0.b, z0.b \n" /* YYYY */         \
  "trn1 z0.b, z0.b, z0.b \n" /* YYYY */

// Like NVTORGB_SVE but U/V components are stored in widened .h elements of
// z1/z2 rather than even/odd .b lanes of z1.
#define I4XXTORGB_SVE                                     \
  "umulh z0.h, z24.h, z0.h \n" /* Y */  \
  "umullb z6.h, z30.b, z1.b \n"          \
  "umullb z4.h, z28.b, z1.b \n" /* DB */ \
  "umullb z5.h, z29.b, z2.b \n" /* DR */ \
  "umlalb z6.h, z31.b, z2.b \n" /* DG */ \
  "add z17.h, z0.h, z26.h \n" /* G */  \
  "add z16.h, z0.h, z4.h \n" /* B */  \
  "add z18.h, z0.h, z5.h \n" /* R */  \
  "uqsub z17.h, z17.h, z6.h \n" /* G */  \
  "uqsub z16.h, z16.h, z25.h \n" /* B */  \
  "uqsub z18.h, z18.h, z27.h \n" /* R */

// Like I4XXTORGB_SVE but U/V components are stored in even/odd .b lanes of z1
// rather than widened .h elements of z1/z2.
#define NVTORGB_SVE                                       \
  "umulh z0.h, z24.h, z0.h \n" /* Y */  \
  "umullb z6.h, z30.b, z1.b \n"          \
  "umullb z4.h, z28.b, z1.b \n" /* DB */ \
  "umullt z5.h, z29.b, z1.b \n" /* DR */ \
  "umlalt z6.h, z31.b, z1.b \n" /* DG */ \
  "add z17.h, z0.h, z26.h \n" /* G */  \
  "add z16.h, z0.h, z4.h \n" /* B */  \
  "add z18.h, z0.h, z5.h \n" /* R */  \
  "uqsub z17.h, z17.h, z6.h \n" /* G */  \
  "uqsub z16.h, z16.h, z25.h \n" /* B */  \
  "uqsub z18.h, z18.h, z27.h \n" /* R */

// The U/V component multiplies do not need to be duplicated in I422, we just
// need to combine them with Y0/Y1 correctly.
#define I422TORGB_SVE_2X                                  \
  "umulh z0.h, z24.h, z0.h \n" /* Y0 */ \
  "umulh z1.h, z24.h, z1.h \n" /* Y1 */ \
  "umullb z6.h, z30.b, z2.b \n"          \
  "umullb z4.h, z28.b, z2.b \n" /* DB */ \
  "umullb z5.h, z29.b, z3.b \n" /* DR */ \
  "umlalb z6.h, z31.b, z3.b \n" /* DG */ \
                                                          \
  "add z17.h, z0.h, z26.h \n" /* G0 */ \
  "add z21.h, z1.h, z26.h \n" /* G1 */ \
  "add z16.h, z0.h, z4.h \n" /* B0 */ \
  "add z20.h, z1.h, z4.h \n" /* B1 */ \
  "add z18.h, z0.h, z5.h \n" /* R0 */ \
  "add z22.h, z1.h, z5.h \n" /* R1 */ \
  "uqsub z17.h, z17.h, z6.h \n" /* G0 */ \
  "uqsub z21.h, z21.h, z6.h \n" /* G1 */ \
  "uqsub z16.h, z16.h, z25.h \n" /* B0 */ \
  "uqsub z20.h, z20.h, z25.h \n" /* B1 */ \
  "uqsub z18.h, z18.h, z27.h \n" /* R0 */ \
  "uqsub z22.h, z22.h, z27.h \n" /* R1 */

// clang-format off
#define NVTORGB_SVE_2X(bt_u, bt_v)                        \
  "umulh z0.h, z24.h, z0.h \n" /* Y0 */ \
  "umulh z1.h, z24.h, z1.h \n" /* Y1 */ \
  "umull" #bt_u " z6.h, z30.b, z2.b \n"          \
  "umull" #bt_u " z4.h, z28.b, z2.b \n" /* DB */ \
  "umull" #bt_v " z5.h, z29.b, z2.b \n" /* DR */ \
  "umlal" #bt_v " z6.h, z31.b, z2.b \n" /* DG */ \
                                                          \
  "add z17.h, z0.h, z26.h \n" /* G0 */ \
  "add z21.h, z1.h, z26.h \n" /* G1 */ \
  "add z16.h, z0.h, z4.h \n" /* B0 */ \
  "add z20.h, z1.h, z4.h \n" /* B1 */ \
  "add z18.h, z0.h, z5.h \n" /* R0 */ \
  "add z22.h, z1.h, z5.h \n" /* R1 */ \
  "uqsub z17.h, z17.h, z6.h \n" /* G0 */ \
  "uqsub z21.h, z21.h, z6.h \n" /* G1 */ \
  "uqsub z16.h, z16.h, z25.h \n" /* B0 */ \
  "uqsub z20.h, z20.h, z25.h \n" /* B1 */ \
  "uqsub z18.h, z18.h, z27.h \n" /* R0 */ \
  "uqsub z22.h, z22.h, z27.h \n" /* R1 */
// clang-format on

#define RGBTOARGB8_SVE_TOP_2X                        \
  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h */       \
  "uqshl z16.h, p0/m, z16.h, #2 \n" /* B0 */ \
  "uqshl z17.h, p0/m, z17.h, #2 \n" /* G0 */ \
  "uqshl z18.h, p0/m, z18.h, #2 \n" /* R0 */ \
  "uqshl z20.h, p0/m, z20.h, #2 \n" /* B1 */ \
  "uqshl z21.h, p0/m, z21.h, #2 \n" /* G1 */ \
  "uqshl z22.h, p0/m, z22.h, #2 \n" /* R1 */

// Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
// pairs to allow us to use ST2 for storing rather than ST4.
#define RGBTOARGB8_SVE                                    \
  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
  "uqshrnb z16.b, z16.h, #6 \n" /* B0 */          \
  "uqshrnb z18.b, z18.h, #6 \n" /* R0 */          \
  "uqshrnt z16.b, z17.h, #6 \n" /* BG */          \
  "trn1 z17.b, z18.b, z19.b \n" /* RA */

// Convert from 2.14 fixed point RGBA to 8 bit ARGB, interleaving as BG and RA
// pairs to allow us to use ST2 for storing rather than ST4.
#define RGBATOARGB8_SVE                                   \
  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.h */ \
  "uqshrnb z16.b, z16.h, #6 \n" /* B0 */          \
  "uqshrnt z16.b, z17.h, #6 \n" /* BG */          \
  "uqshrnb z17.b, z18.h, #6 \n" /* R0 */          \
  "uqshrnt z17.b, z19.h, #2 \n" /* RA */

// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as AB and GR
// pairs to allow us to use ST2 for storing rather than ST4.
#define RGBTORGBA8_SVE                                    \
  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
  "uqshrnt z19.b, z16.h, #6 \n" /* AB */          \
  "uqshrnb z20.b, z17.h, #6 \n" /* G0 */          \
  "uqshrnt z20.b, z18.h, #6 \n" /* GR */

#define RGBTOARGB8_SVE_2X                                 \
  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
  "uqshrnb z16.b, z16.h, #6 \n" /* B0 */          \
  "uqshrnb z17.b, z17.h, #6 \n" /* G0 */          \
  "uqshrnb z18.b, z18.h, #6 \n" /* R0 */          \
  "uqshrnt z16.b, z20.h, #6 \n" /* B1 */          \
  "uqshrnt z17.b, z21.h, #6 \n" /* G1 */          \
  "uqshrnt z18.b, z22.h, #6 \n" /* R1 */

// Store AR30 elements. Inputs are 2.14 fixed point RGB. We expect z23 to be
// populated with 0x3ff0 (0x3fff would also work) to saturate the R input
// rather than needing a pair of shifts to saturate and then insert into the
// correct position in the lane.
#define STOREAR30_SVE                                                    \
  "uqshl z16.h, p0/m, z16.h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
  "uqshl z17.h, p0/m, z17.h, #2 \n" /* ggggggggggxxxxxx */ \
  "umin z18.h, p0/m, z18.h, z23.h \n" /* 00rrrrrrrrrrxxxx */ \
  "orr z18.h, z18.h, #0xc000 \n" /* 11rrrrrrrrrrxxxx */ \
  "sri z18.h, z17.h, #12 \n" /* 11rrrrrrrrrrgggg */ \
  "lsl z17.h, z17.h, #4 \n" /* ggggggxxxxxx0000 */ \
  "sri z17.h, z16.h, #6 \n" /* ggggggbbbbbbbbbb */ \
  "st2h {z17.h, z18.h}, p1, [%[dst_ar30]] \n"                        \
  "incb %[dst_ar30], all, mul #2 \n"

#define YUVTORGB_SVE_REGS                                                     \
  "z0""z1""z2""z3""z4""z5""z6""z7""z16""z17""z18""z19", \
      "z20""z22""z23""z24""z25""z26""z27""z28""z29""z30",   \
      "z31""p0""p1""p2""p3"

static inline void I400ToARGBRow_SVE_SC(const uint8_t* src_y,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cnth %[vl] \n"
      "ptrue p0.b \n"
      "dup z19.b, #255 \n"  // Alpha
      YUVTORGB_SVE_SETUP
      "cmp %w[width], %w[vl] \n"
      "mov z1.h, #128 \n"  // U/V
      "umullb z6.h, z30.b, z1.b \n"
      "umullb z4.h, z28.b, z1.b \n"  // DB
      "umullb z5.h, z29.b, z1.b \n"  // DR
      "mla z6.h, p0/m, z31.h, z1.h \n"  // DG
      "sub z4.h, z4.h, z25.h \n"
      "sub z5.h, z5.h, z27.h \n"
      "sub z6.h, z26.h, z6.h \n"
      "b.le 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "sub %w[width], %w[width], %w[vl] \n"
      "1: \n"  //
      READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.gt 1b \n"
      "add %w[width], %w[width], %w[vl] \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "2: \n"
      "whilelt p1.h, wzr, %w[width] \n"  //
      READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

static inline void I422ToARGBRow_SVE_SC(const uint8_t* src_y,
                                        const uint8_t* src_u,
                                        const uint8_t* src_v,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cntb %[vl] \n"
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "dup z19.b, #255 \n"  // Alpha
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "1: \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_2X
      "subs %w[width], %w[width], %w[vl] \n"
      "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
      "incb %[dst_argb], all, mul #4 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth %[vl] \n"
      "whilelt p1.b, wzr, %w[width] \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_2X
      "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

static inline void I422ToRGB24Row_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_u,
    const uint8_t* src_v,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cntb %[vl] \n"
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "1: \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_2X
      "subs %w[width], %w[width], %w[vl] \n"
      "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_argb]] \n"
      "incb %[dst_argb], all, mul #3 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth %[vl] \n"
      "whilelt p1.b, wzr, %w[width] \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_2X
      "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

#define RGB8TORGB565_SVE_FROM_TOP_2X                        \
  "sri z18.h, z17.h, #5 \n" /* rrrrrgggggg00000 */ \
  "sri z22.h, z21.h, #5 \n" /* rrrrrgggggg00000 */ \
  "sri z18.h, z16.h, #11 \n" /* rrrrrggggggbbbbb */ \
  "sri z22.h, z20.h, #11 \n" /* rrrrrggggggbbbbb */ \
  "mov z19.d, z22.d \n"

static inline void I422ToRGB565Row_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_u,
    const uint8_t* src_v,
    uint8_t* dst_rgb565,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cntb %[vl] \n"
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "1: \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
      "subs %w[width], %w[width], %w[vl] \n"  //
      RGB8TORGB565_SVE_FROM_TOP_2X
      "st2h {z18.h, z19.h}, p1, [%[dst]] \n"
      "incb %[dst], all, mul #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth %[vl] \n"
      "whilelt p1.b, wzr, %w[width] \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
          RGB8TORGB565_SVE_FROM_TOP_2X
      "st2h {z18.h, z19.h}, p1, [%[dst]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst] "+r"(dst_rgb565),                            // %[dst]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

#define RGB8TOARGB1555_SVE_FROM_TOP_2X                      \
  "dup z0.h, #0x8000 \n" /* 1000000000000000 */ \
  "dup z1.h, #0x8000 \n" /* 1000000000000000 */ \
  "sri z0.h, z18.h, #1 \n" /* 1rrrrrxxxxxxxxxx */ \
  "sri z1.h, z22.h, #1 \n" /* 1rrrrrxxxxxxxxxx */ \
  "sri z0.h, z17.h, #6 \n" /* 1rrrrrgggggxxxxx */ \
  "sri z1.h, z21.h, #6 \n" /* 1rrrrrgggggxxxxx */ \
  "sri z0.h, z16.h, #11 \n" /* 1rrrrrgggggbbbbb */ \
  "sri z1.h, z20.h, #11 \n" /* 1rrrrrgggggbbbbb */

static inline void I422ToARGB1555Row_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_u,
    const uint8_t* src_v,
    uint8_t* dst_argb1555,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cntb %[vl] \n"
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "1: \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
      "subs %w[width], %w[width], %w[vl] \n"  //
      RGB8TOARGB1555_SVE_FROM_TOP_2X
      "st2h {z0.h, z1.h}, p1, [%[dst]] \n"
      "incb %[dst], all, mul #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth %[vl] \n"
      "whilelt p1.b, wzr, %w[width] \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
          RGB8TOARGB1555_SVE_FROM_TOP_2X
      "st2h {z0.h, z1.h}, p1, [%[dst]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst] "+r"(dst_argb1555),                          // %[dst]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

#define RGB8TOARGB4444_SVE_FROM_TOP_2X                      \
  "dup z0.h, #0xf000 \n" /* 1111000000000000 */ \
  "dup z1.h, #0xf000 \n" /* 1111000000000000 */ \
  "sri z0.h, z18.h, #4 \n" /* 1111rrrrxxxxxxxx */ \
  "sri z1.h, z22.h, #4 \n" /* 1111rrrrxxxxxxxx */ \
  "sri z0.h, z17.h, #8 \n" /* 1111rrrrggggxxxx */ \
  "sri z1.h, z21.h, #8 \n" /* 1111rrrrggggxxxx */ \
  "sri z0.h, z16.h, #12 \n" /* 1111rrrrggggbbbb */ \
  "sri z1.h, z20.h, #12 \n" /* 1111rrrrggggbbbb */

static inline void I422ToARGB4444Row_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_u,
    const uint8_t* src_v,
    uint8_t* dst_argb4444,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cntb %[vl] \n"
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "1: \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
      "subs %w[width], %w[width], %w[vl] \n"  //
      RGB8TOARGB4444_SVE_FROM_TOP_2X
      "st2h {z0.h, z1.h}, p1, [%[dst]] \n"
      "incb %[dst], all, mul #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth %[vl] \n"
      "whilelt p1.b, wzr, %w[width] \n"  //
      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
          RGB8TOARGB4444_SVE_FROM_TOP_2X
      "st2h {z0.h, z1.h}, p1, [%[dst]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst] "+r"(dst_argb4444),                          // %[dst]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

static inline void I422ToRGBARow_SVE_SC(const uint8_t* src_y,
                                        const uint8_t* src_u,
                                        const uint8_t* src_v,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cnth %[vl] \n"
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "dup z19.b, #255 \n"  // Alpha
      "subs %w[width], %w[width], %w[vl] \n"
      "b.le 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "1: \n"  //
      READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "st2h {z19.h, z20.h}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.gt 1b \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      "whilelt p1.h, wzr, %w[width] \n"  //
      READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
      "st2h {z19.h, z20.h}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

static inline void I422AlphaToARGBRow_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_u,
    const uint8_t* src_v,
    const uint8_t* src_a,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cntb %[vl] \n"
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "1: \n"  //
      READYUV422_SVE_2X
      "ld1b {z19.b}, p1/z, [%[src_a]] \n"
      "add %[src_a], %[src_a], %[vl] \n"  // Alpha
      I422TORGB_SVE_2X RGBTOARGB8_SVE_2X
      "subs %w[width], %w[width], %w[vl] \n"
      "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
      "incb %[dst_argb], all, mul #4 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth %[vl] \n"
      "whilelt p1.b, wzr, %w[width] \n"  //
      READYUV422_SVE_2X
      "ld1b {z19.b}, p1/z, [%[src_a]] \n"  // Alpha
      I422TORGB_SVE_2X RGBTOARGB8_SVE_2X
      "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

static inline void I444AlphaToARGBRow_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_u,
    const uint8_t* src_v,
    const uint8_t* src_a,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      "cnth %[vl] \n"
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "1: \n"  //
      READYUV444_SVE
      "ld1b {z19.h}, p1/z, [%[src_a]] \n"
      "add %[src_a], %[src_a], %[vl] \n"  // Alpha
      I4XXTORGB_SVE RGBTOARGB8_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.h, wzr, %w[width] \n"  //
      READYUV444_SVE
      "ld1b {z19.h}, p1/z, [%[src_a]] \n"  // Alpha
      I4XXTORGB_SVE RGBTOARGB8_SVE
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width),                               // %[width]
        [vl] "=&r"(vl)                                     // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

static inline void NV12ToARGBRow_SVE_SC(const uint8_t* src_y,
                                        const uint8_t* src_uv,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm("cntb %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  int width_last_uv = width_last_y + (width_last_y & 1);
  asm volatile(
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "dup z19.b, #255 \n"  // Alpha
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "ptrue p2.b \n"
      "1: \n"  //
      READNV_SVE_2X NVTORGB_SVE_2X(b, t) RGBTOARGB8_SVE_2X
      "subs %w[width], %w[width], %w[vl] \n"
      "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.b, wzr, %w[width_last_y] \n"
      "whilelt p2.b, wzr, %w[width_last_uv] \n"  //
      READNV_SVE_2X NVTORGB_SVE_2X(b, t) RGBTOARGB8_SVE_2X
      "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
      : "cc""memory", YUVTORGB_SVE_REGS, "p2");
}

static inline void NV21ToARGBRow_SVE_SC(const uint8_t* src_y,
                                        const uint8_t* src_vu,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm("cntb %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  int width_last_uv = width_last_y + (width_last_y & 1);
  asm volatile(
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "dup z19.b, #255 \n"  // Alpha
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "ptrue p2.b \n"
      "1: \n"  //
      READNV_SVE_2X NVTORGB_SVE_2X(t, b) RGBTOARGB8_SVE_2X
      "subs %w[width], %w[width], %w[vl] \n"
      "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.b, wzr, %w[width_last_y] \n"
      "whilelt p2.b, wzr, %w[width_last_uv] \n"  //
      READNV_SVE_2X NVTORGB_SVE_2X(t, b) RGBTOARGB8_SVE_2X
      "st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_vu),                              // %[src_vu]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
      : "cc""memory", YUVTORGB_SVE_REGS, "p2");
}

static inline void NV12ToRGB24Row_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_uv,
    uint8_t* dst_rgb24,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm("cntb %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  int width_last_uv = width_last_y + (width_last_y & 1);
  asm volatile(
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "dup z19.b, #255 \n"  // Alpha
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "ptrue p2.b \n"
      "1: \n"  //
      READNV_SVE_2X NVTORGB_SVE_2X(b, t) RGBTOARGB8_SVE_2X
      "subs %w[width], %w[width], %w[vl] \n"
      "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
      "incb %[dst_rgb24], all, mul #3 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.b, wzr, %w[width_last_y] \n"
      "whilelt p2.b, wzr, %w[width_last_uv] \n"  //
      READNV_SVE_2X NVTORGB_SVE_2X(b, t) RGBTOARGB8_SVE_2X
      "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),                        // %[dst_rgb24]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
      : "cc""memory", YUVTORGB_SVE_REGS, "p2");
}

static inline void NV21ToRGB24Row_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_vu,
    uint8_t* dst_rgb24,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm("cntb %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  int width_last_uv = width_last_y + (width_last_y & 1);
  asm volatile(
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "dup z19.b, #255 \n"  // Alpha
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.b \n"
      "ptrue p2.b \n"
      "1: \n"  //
      READNV_SVE_2X NVTORGB_SVE_2X(t, b) RGBTOARGB8_SVE_2X
      "subs %w[width], %w[width], %w[vl] \n"
      "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
      "incb %[dst_rgb24], all, mul #3 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.b, wzr, %w[width_last_y] \n"
      "whilelt p2.b, wzr, %w[width_last_uv] \n"  //
      READNV_SVE_2X NVTORGB_SVE_2X(t, b) RGBTOARGB8_SVE_2X
      "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_vu),                              // %[src_vu]
        [dst_rgb24] "+r"(dst_rgb24),                        // %[dst_rgb24]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
      : "cc""memory", YUVTORGB_SVE_REGS, "p2");
}

#define READYUY2_SVE                                        \
  "ld1w {z0.s}, p2/z, [%[src_yuy2]] \n" /* YUYV */ \
  "incb %[src_yuy2] \n"            \
  "prfm pldl1keep, [%[src_yuy2], 448] \n"            \
  "tbl z1.b, {z0.b}, z22.b \n" /* UVUV */ \
  "trn1 z0.b, z0.b, z0.b \n" /* YYYY */

static inline void YUY2ToARGBRow_SVE_SC(const uint8_t* src_yuy2,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint32_t nv_uv_start = 0x03010301U;
  uint32_t nv_uv_step = 0x04040404U;
  uint64_t vl;
  asm("cnth %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  int width_last_uv = width_last_y + (width_last_y & 1);
  asm volatile(
      "ptrue p0.b \n"
      "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n"
      "dup z19.b, #255 \n"  // Alpha
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "ptrue p2.h \n"
      "1: \n"  //
      READYUY2_SVE NVTORGB_SVE RGBTOARGB8_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.h, wzr, %w[width_last_y] \n"
      "whilelt p2.h, wzr, %w[width_last_uv] \n"  //
      READYUY2_SVE NVTORGB_SVE RGBTOARGB8_SVE
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_yuy2] "+r"(src_yuy2),                          // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [nv_uv_start] "r"(nv_uv_start),                     // %[nv_uv_start]
        [nv_uv_step] "r"(nv_uv_step),                       // %[nv_uv_step]
        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
      : "cc""memory", YUVTORGB_SVE_REGS, "p2");
}

#define READUYVY_SVE                                        \
  "ld1w {z0.s}, p2/z, [%[src_uyvy]] \n" /* UYVY */ \
  "incb %[src_uyvy] \n"            \
  "prfm pldl1keep, [%[src_uyvy], 448] \n"            \
  "tbl z1.b, {z0.b}, z22.b \n" /* UVUV */ \
  "trn2 z0.b, z0.b, z0.b \n" /* YYYY */

static inline void UYVYToARGBRow_SVE_SC(const uint8_t* src_uyvy,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint32_t nv_uv_start = 0x02000200U;
  uint32_t nv_uv_step = 0x04040404U;
  uint64_t vl;
  asm("cnth %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  int width_last_uv = width_last_y + (width_last_y & 1);
  asm volatile(
      "ptrue p0.b \n"
      "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n"
      "dup z19.b, #255 \n"  // Alpha
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "ptrue p2.h \n"
      "1: \n"  //
      READUYVY_SVE NVTORGB_SVE RGBTOARGB8_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "2: \n"
      "whilelt p1.h, wzr, %w[width_last_y] \n"
      "whilelt p2.h, wzr, %w[width_last_uv] \n"  //
      READUYVY_SVE NVTORGB_SVE RGBTOARGB8_SVE
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_uyvy] "+r"(src_uyvy),                          // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [nv_uv_start] "r"(nv_uv_start),                     // %[nv_uv_start]
        [nv_uv_step] "r"(nv_uv_step),                       // %[nv_uv_step]
        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
      : "cc""memory", YUVTORGB_SVE_REGS, "p2");
}

static inline void I210ToARGBRow_SVE_SC(const uint16_t* src_y,
                                        const uint16_t* src_u,
                                        const uint16_t* src_v,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm("cnth %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  asm volatile(
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "dup z19.b, #255 \n"  // Alpha
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "1: \n"  //
      READI210_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.h, wzr, %w[width_last_y] \n"  //
      READI210_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_u] "+r"(src_u),                                // %[src_u]
        [src_v] "+r"(src_v),                                // %[src_v]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [width_last_y] "r"(width_last_y)                    // %[width_last_y]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

static inline void I210AlphaToARGBRow_SVE_SC(
    const uint16_t* src_y,
    const uint16_t* src_u,
    const uint16_t* src_v,
    const uint16_t* src_a,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm("cnth %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  asm volatile(
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "1: \n"  //
      READI210_SVE
      "ld1h {z19.h}, p1/z, [%[src_a]] \n"  //
      I4XXTORGB_SVE
      "incb %[src_a] \n"  //
      RGBATOARGB8_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.h, wzr, %w[width_last_y] \n"  //
      READI210_SVE
      "ld1h {z19.h}, p1/z, [%[src_a]] \n"  //
      I4XXTORGB_SVE RGBATOARGB8_SVE
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"

      "99: \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_u] "+r"(src_u),                                // %[src_u]
        [src_v] "+r"(src_v),                                // %[src_v]
        [src_a] "+r"(src_a),                                // %[src_a]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [width_last_y] "r"(width_last_y)                    // %[width_last_y]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

static inline void I210ToAR30Row_SVE_SC(const uint16_t* src_y,
                                        const uint16_t* src_u,
                                        const uint16_t* src_v,
                                        uint8_t* dst_ar30,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm("cnth %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  // The limit is used for saturating the 2.14 red channel in STOREAR30_SVE.
  uint16_t limit = 0x3ff0;
  asm volatile(
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "dup z23.h, %w[limit] \n"
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "1: \n"  //
      READI210_SVE I4XXTORGB_SVE STOREAR30_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt p1.h, wzr, %w[width_last_y] \n"  //
      READI210_SVE I4XXTORGB_SVE STOREAR30_SVE

      "99: \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_u] "+r"(src_u),                                // %[src_u]
        [src_v] "+r"(src_v),                                // %[src_v]
        [dst_ar30] "+r"(dst_ar30),                          // %[dst_ar30]
        [width] "+r"(width)                                 // %[width]
      : [vl] "r"(vl),                                       // %[vl]
        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
        [limit] "r"(limit)                                  // %[limit]
      : "cc""memory", YUVTORGB_SVE_REGS);
}

// P210 has 10 bits in msb of 16 bit NV12 style layout.
static inline void P210ToARGBRow_SVE_SC(const uint16_t* src_y,
                                        const uint16_t* src_uv,
                                        uint8_t* dst_argb,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm("cnth %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
  int width_last_uv = width_last_y + (width_last_y & 1);
  uint32_t nv_uv_start = 0x03010301U;
  uint32_t nv_uv_step = 0x04040404U;
  asm volatile(
      "ptrue p0.b \n"  //
      YUVTORGB_SVE_SETUP
      "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n"
      "dup z19.b, #255 \n"  // Alpha
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p1.h \n"
      "ptrue p2.h \n"
      "1: \n"  //
      READP210_SVE NVTORGB_SVE RGBTOARGB8_SVE
      "subs %w[width], %w[width], %w[vl] \n"
      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5
C=66 H=76 G=70

¤ Dauer der Verarbeitung: 0.10 Sekunden  (vorverarbeitet)  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.