Quelle highbd_sad_avx2.c

Sprache: C

/*
*  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
*
*  Use of this source code is governed by a BSD-style license
*  that can be found in the LICENSE file in the root of the source
*  tree. An additional intellectual property rights grant can be found
*  in the file PATENTS.  All contributing project authors may
*  be found in the AUTHORS file in the root of the source tree.
*/
#include <immintrin.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

/* Horizontally adds the eight 32-bit lanes of sums_32 and returns the total.
 * All lane additions wrap modulo 2^32. */
static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
  // 8 lanes -> 4 lanes: fold the upper 128-bit half onto the lower half.
  const __m128i folded4 = _mm_add_epi32(_mm256_castsi256_si128(sums_32),
                                        _mm256_extractf128_si256(sums_32, 1));
  // 4 lanes -> 2 lanes: add the upper 64 bits onto the lower 64 bits.
  const __m128i folded2 = _mm_add_epi32(folded4, _mm_srli_si128(folded4, 8));
  // 2 lanes -> 1 lane: lane 0 now holds the sum of all eight input lanes.
  const __m128i total = _mm_add_epi32(folded2, _mm_srli_si128(folded2, 4));
  return (unsigned int)_mm_cvtsi128_si32(total);
}

/* Accumulates |ref - src| for `height` rows of a 64-sample-wide block into the
 * sixteen 16-bit lanes of *sums_16.
 *
 * sums_16:    in/out accumulator; lane additions wrap at 16 bits, so the
 *             caller must limit `height` so that no lane can overflow.
 * src:        source rows, read with aligned 32-byte loads.
 * src_stride: distance between source rows, in uint16_t elements.
 * ref:        reference rows, read with unaligned loads; never written.
 * ref_stride: distance between reference rows, in uint16_t elements.
 * height:     number of rows to process.
 */
static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            int height) {
  __m256i acc = *sums_16;
  int row;
  for (row = 0; row < height; ++row) {
    // samples 0..15
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i d0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    // samples 16..31
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i d1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // samples 32..47
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i d2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
    // samples 48..63
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    const __m256i d3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));

    // Each 16-bit lane gains four absolute differences per row.
    acc = _mm256_add_epi16(acc, _mm256_add_epi16(d0, d1));
    acc = _mm256_add_epi16(acc, _mm256_add_epi16(d2, d3));

    src += src_stride;
    ref += ref_stride;
  }
  *sums_16 = acc;
}

/* Returns the sum of absolute differences between a 64-sample-wide, n-row
 * block at src_ptr and the block at ref_ptr.
 *
 * Both byte pointers are turned into uint16_t sample pointers with
 * CONVERT_TO_SHORTPTR; the strides are in uint16_t elements. Rows are handled
 * two at a time, so an odd trailing row (n % 2) is not included.
 */
static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  const int num_row_pairs = n / 2;
  __m256i total_32 = _mm256_setzero_si256();
  int pair;

  for (pair = 0; pair < num_row_pairs; ++pair) {
    __m256i pair_16 = _mm256_setzero_si256();
    __m256i lo_32, hi_32;

    highbd_sad64xH(&pair_16, src, src_stride, ref, ref_stride, 2);

    // The 16-bit lanes are only relied upon for two rows; widen them to
    // 32 bits (zero-extended) before adding to the running total.
    lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pair_16));
    hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(pair_16, 1));
    total_32 = _mm256_add_epi32(total_32, _mm256_add_epi32(lo_32, hi_32));

    src += src_stride << 1;
    ref += ref_stride << 1;
  }
  return calc_final(total_32);
}

// Defines the exported function vpx_highbd_sad64x<n>_avx2(), which forwards
// its arguments unchanged to highbd_sad64xN_avx2() with n as the row count.
#define HIGHBD_SAD64XN(n)                                                      \
  unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                 \
                                           int ref_stride) {                   \
    return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n);           \
  }

// Defines the exported function vpx_highbd_sad_skip_64x<n>_avx2(). It calls
// highbd_sad64xN_avx2() with both strides doubled and the row count halved,
// so only rows 0, 2, 4, ... are measured, and returns twice that SAD.
#define HIGHBD_SADSKIP64xN(n)                                                \
  unsigned int vpx_highbd_sad_skip_64x##n##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                   \
  }

/* Accumulates |ref - src| for `height` rows of a 32-sample-wide block into the
 * sixteen 16-bit lanes of *sums_16.
 *
 * sums_16:    in/out accumulator; lane additions wrap at 16 bits, so the
 *             caller must limit `height` so that no lane can overflow.
 * src:        source rows, read with aligned 32-byte loads.
 * src_stride: distance between source rows, in uint16_t elements.
 * ref:        reference rows, read with unaligned loads; never written.
 * ref_stride: distance between reference rows, in uint16_t elements.
 * height:     number of rows to process.
 */
static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            int height) {
  __m256i acc = *sums_16;
  int row;
  for (row = 0; row < height; ++row) {
    // samples 0..15
    const __m256i s_lo = _mm256_load_si256((const __m256i *)src);
    const __m256i r_lo = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i d_lo = _mm256_abs_epi16(_mm256_sub_epi16(r_lo, s_lo));
    // samples 16..31
    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r_hi = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i d_hi = _mm256_abs_epi16(_mm256_sub_epi16(r_hi, s_hi));

    // Each 16-bit lane gains two absolute differences per row.
    acc = _mm256_add_epi16(acc, d_lo);
    acc = _mm256_add_epi16(acc, d_hi);

    src += src_stride;
    ref += ref_stride;
  }
  *sums_16 = acc;
}

/* Returns the sum of absolute differences between a 32-sample-wide, n-row
 * block at src_ptr and the block at ref_ptr.
 *
 * Both byte pointers are turned into uint16_t sample pointers with
 * CONVERT_TO_SHORTPTR; the strides are in uint16_t elements. Rows are handled
 * in bands of eight, so any trailing n % 8 rows are not included.
 */
static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  const int num_bands = n / 8;
  __m256i total_32 = _mm256_setzero_si256();
  int band;

  for (band = 0; band < num_bands; ++band) {
    __m256i band_16 = _mm256_setzero_si256();
    __m256i lo_32, hi_32;

    highbd_sad32xH(&band_16, src, src_stride, ref, ref_stride, 8);

    // The 16-bit lanes are only relied upon for eight rows; widen them to
    // 32 bits (zero-extended) before adding to the running total.
    lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(band_16));
    hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(band_16, 1));
    total_32 = _mm256_add_epi32(total_32, _mm256_add_epi32(lo_32, hi_32));

    src += src_stride << 3;
    ref += ref_stride << 3;
  }
  return calc_final(total_32);
}

// Defines the exported function vpx_highbd_sad32x<n>_avx2(), which forwards
// its arguments unchanged to highbd_sad32xN_avx2() with n as the row count.
#define HIGHBD_SAD32XN(n)                                                      \
  unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                 \
                                           int ref_stride) {                   \
    return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n);           \
  }

// Defines the exported function vpx_highbd_sad_skip_32x<n>_avx2(). It calls
// highbd_sad32xN_avx2() with both strides doubled and the row count halved,
// so only rows 0, 2, 4, ... are measured, and returns twice that SAD.
#define HIGHBD_SADSKIP32xN(n)                                                \
  unsigned int vpx_highbd_sad_skip_32x##n##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                   \
  }

/* Accumulates |ref - src| for `height` rows of a 16-sample-wide block into the
 * sixteen 16-bit lanes of *sums_16.
 *
 * Two rows are consumed per loop iteration, so `height` is expected to be
 * even; an odd value would process height + 1 rows.
 *
 * sums_16:    in/out accumulator; lane additions wrap at 16 bits, so the
 *             caller must limit `height` so that no lane can overflow.
 * src:        source rows, read with aligned 32-byte loads.
 * src_stride: distance between source rows, in uint16_t elements.
 * ref:        reference rows, read with unaligned loads; never written.
 * ref_stride: distance between reference rows, in uint16_t elements.
 * height:     number of rows to process.
 */
static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            const uint16_t *ref, int ref_stride,
                                            int height) {
  __m256i acc = *sums_16;
  int row;
  for (row = 0; row < height; row += 2) {
    // first row of the pair
    const __m256i s_top = _mm256_load_si256((const __m256i *)src);
    const __m256i r_top = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i d_top = _mm256_abs_epi16(_mm256_sub_epi16(r_top, s_top));
    // second row of the pair
    const __m256i s_bot =
        _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r_bot =
        _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    const __m256i d_bot = _mm256_abs_epi16(_mm256_sub_epi16(r_bot, s_bot));

    // Each 16-bit lane gains one absolute difference per row.
    acc = _mm256_add_epi16(acc, d_top);
    acc = _mm256_add_epi16(acc, d_bot);

    src += src_stride << 1;
    ref += ref_stride << 1;
  }
  *sums_16 = acc;
}

/* Returns the sum of absolute differences between a 16-sample-wide, n-row
 * block at src_ptr and the block at ref_ptr.
 *
 * Both byte pointers are turned into uint16_t sample pointers with
 * CONVERT_TO_SHORTPTR; the strides are in uint16_t elements.
 *
 * Rows are processed in bands of `height` = min(16, n) rows. After each band
 * the 16-bit partial sums are zero-extended to 32 bits and added to the
 * running total, because the 16-bit lanes are only relied upon for 16 rows.
 * Any trailing n % height rows are not included.
 */
static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
                                                         int src_stride,
                                                         const uint8_t *ref_ptr,
                                                         int ref_stride,
                                                         int n) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_32 = _mm256_setzero_si256();
  const int height = VPXMIN(16, n);
  const int num_iters = n / height;
  // Advance by exactly the rows consumed per band. A fixed 16-row step would
  // move the pointers beyond the block whenever n < 16.
  const int src_step = src_stride * height;
  const int ref_step = ref_stride * height;
  int i;

  for (i = 0; i < num_iters; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height);

    // Widen the band's 16-bit lane sums to 32 bits and accumulate.
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_step;
    ref += ref_step;
  }
  return calc_final(sums_32);
}

// Defines the exported function vpx_highbd_sad16x<n>_avx2(), which forwards
// its arguments unchanged to highbd_sad16xN_avx2() with n as the row count.
#define HIGHBD_SAD16XN(n)                                                      \
  unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                 \
                                           int ref_stride) {                   \
    return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n);           \
  }

// Defines the exported function vpx_highbd_sad_skip_16x<n>_avx2(). It calls
// highbd_sad16xN_avx2() with both strides doubled and the row count halved,
// so only rows 0, 2, 4, ... are measured, and returns twice that SAD.
#define HIGHBD_SADSKIP16xN(n)                                                \
  unsigned int vpx_highbd_sad_skip_16x##n##_avx2(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,                \
      int ref_stride) {                                                      \
    return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                   \
  }

/* Returns the sum of absolute differences between the 16x16 block of 16-bit
 * samples at src_ptr and the one at ref_ptr. All 16 rows are accumulated in
 * 16-bit lanes in a single pass and widened to 32 bits only for the final
 * reduction. Strides are in uint16_t elements. */
unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  __m256i acc_16 = _mm256_setzero_si256();
  __m128i lo_half, hi_half;

  highbd_sad16xH(&acc_16, CONVERT_TO_SHORTPTR(src_ptr), src_stride,
                 CONVERT_TO_SHORTPTR(ref_ptr), ref_stride, 16);

  // Zero-extend both 128-bit halves to 32-bit lanes, add them, then reduce.
  lo_half = _mm256_castsi256_si128(acc_16);
  hi_half = _mm256_extractf128_si256(acc_16, 1);
  return calc_final(_mm256_add_epi32(_mm256_cvtepu16_epi32(lo_half),
                                     _mm256_cvtepu16_epi32(hi_half)));
}

/* Returns the sum of absolute differences between the 16x8 block of 16-bit
 * samples at src_ptr and the one at ref_ptr. All 8 rows are accumulated in
 * 16-bit lanes in a single pass and widened to 32 bits only for the final
 * reduction. Strides are in uint16_t elements. */
unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  __m256i acc_16 = _mm256_setzero_si256();
  __m128i lo_half, hi_half;

  highbd_sad16xH(&acc_16, CONVERT_TO_SHORTPTR(src_ptr), src_stride,
                 CONVERT_TO_SHORTPTR(ref_ptr), ref_stride, 8);

  // Zero-extend both 128-bit halves to 32-bit lanes, add them, then reduce.
  lo_half = _mm256_castsi256_si128(acc_16);
  hi_half = _mm256_extractf128_si256(acc_16, 1);
  return calc_final(_mm256_add_epi32(_mm256_cvtepu16_epi32(lo_half),
                                     _mm256_cvtepu16_epi32(hi_half)));
}

// Instantiate the exported SAD and skip-SAD functions for each block size.
// The plain 16x16 and 16x8 SAD functions are written out by hand earlier in
// this file, so only their skip variants are generated here.
// clang-format off
HIGHBD_SAD64XN(64)
HIGHBD_SADSKIP64xN(64)
HIGHBD_SAD64XN(32)
HIGHBD_SADSKIP64xN(32)
HIGHBD_SAD32XN(64)
HIGHBD_SADSKIP32xN(64)
HIGHBD_SAD32XN(32)
HIGHBD_SADSKIP32xN(32)
HIGHBD_SAD32XN(16)
HIGHBD_SADSKIP32xN(16)
HIGHBD_SAD16XN(32)
HIGHBD_SADSKIP16xN(32)
HIGHBD_SADSKIP16xN(16)
HIGHBD_SADSKIP16xN(8)
// clang-format on

// AVG -------------------------------------------------------------------------
/* Accumulates |avg(ref, sec) - src| for `height` rows of a 64-sample-wide
 * block into the sixteen 16-bit lanes of *sums_16. The average is the rounded
 * unsigned 16-bit average computed by _mm256_avg_epu16.
 *
 * sums_16:    in/out accumulator; lane additions wrap at 16 bits, so the
 *             caller must limit `height` so that no lane can overflow.
 * src:        source rows, read with aligned 32-byte loads.
 * src_stride: distance between source rows, in uint16_t elements.
 * ref:        reference rows, read with unaligned loads; never written.
 * ref_stride: distance between reference rows, in uint16_t elements.
 * sec:        second prediction, read as contiguous rows of 64 samples;
 *             never written.
 * height:     number of rows to process.
 */
static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride,
                                                const uint16_t *ref,
                                                int ref_stride,
                                                const uint16_t *sec,
                                                int height) {
  __m256i acc = *sums_16;
  int row;
  for (row = 0; row < height; ++row) {
    // samples 0..15
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i p0 =
        _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)ref),
                         _mm256_loadu_si256((const __m256i *)sec));
    const __m256i d0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, s0));
    // samples 16..31
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i p1 =
        _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)(ref + 16)),
                         _mm256_loadu_si256((const __m256i *)(sec + 16)));
    const __m256i d1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, s1));
    // samples 32..47
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i p2 =
        _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)(ref + 32)),
                         _mm256_loadu_si256((const __m256i *)(sec + 32)));
    const __m256i d2 = _mm256_abs_epi16(_mm256_sub_epi16(p2, s2));
    // samples 48..63
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i p3 =
        _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)(ref + 48)),
                         _mm256_loadu_si256((const __m256i *)(sec + 48)));
    const __m256i d3 = _mm256_abs_epi16(_mm256_sub_epi16(p3, s3));

    // Each 16-bit lane gains four absolute differences per row.
    acc = _mm256_add_epi16(acc, _mm256_add_epi16(d0, d1));
    acc = _mm256_add_epi16(acc, _mm256_add_epi16(d2, d3));

    src += src_stride;
    ref += ref_stride;
    sec += 64;
  }
  *sums_16 = acc;
}

#define HIGHBD_SAD64XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 2); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
                                                                              \
      /* sums_16 will outrange after 2 rows, so add current sums_16 to        \
       * sums_32*/                                                            \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 1;                                                 \
      ref += ref_stride << 1;                                                 \
      sec += 64 << 1;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 64x64: defines vpx_highbd_sad64x64_avg_avx2()
HIGHBD_SAD64XN_AVG(64)

// 64x32: defines vpx_highbd_sad64x32_avg_avx2()
HIGHBD_SAD64XN_AVG(32)

/* Accumulates |avg(ref, sec) - src| for `height` rows of a 32-sample-wide
 * block into the sixteen 16-bit lanes of *sums_16. The average is the rounded
 * unsigned 16-bit average computed by _mm256_avg_epu16.
 *
 * sums_16:    in/out accumulator; lane additions wrap at 16 bits, so the
 *             caller must limit `height` so that no lane can overflow.
 * src:        source rows, read with aligned 32-byte loads.
 * src_stride: distance between source rows, in uint16_t elements.
 * ref:        reference rows, read with unaligned loads; never written.
 * ref_stride: distance between reference rows, in uint16_t elements.
 * sec:        second prediction, read as contiguous rows of 32 samples;
 *             never written.
 * height:     number of rows to process.
 */
static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride,
                                                const uint16_t *ref,
                                                int ref_stride,
                                                const uint16_t *sec,
                                                int height) {
  __m256i acc = *sums_16;
  int row;
  for (row = 0; row < height; ++row) {
    // samples 0..15
    const __m256i s_lo = _mm256_load_si256((const __m256i *)src);
    const __m256i p_lo =
        _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)ref),
                         _mm256_loadu_si256((const __m256i *)sec));
    const __m256i d_lo = _mm256_abs_epi16(_mm256_sub_epi16(p_lo, s_lo));
    // samples 16..31
    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i p_hi =
        _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)(ref + 16)),
                         _mm256_loadu_si256((const __m256i *)(sec + 16)));
    const __m256i d_hi = _mm256_abs_epi16(_mm256_sub_epi16(p_hi, s_hi));

    // Each 16-bit lane gains two absolute differences per row.
    acc = _mm256_add_epi16(acc, d_lo);
    acc = _mm256_add_epi16(acc, d_hi);

    src += src_stride;
    ref += ref_stride;
    sec += 32;
  }
  *sums_16 = acc;
}

#define HIGHBD_SAD32XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 8); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
                                                                              \
      /* sums_16 will outrange after 8 rows, so add current sums_16 to        \
       * sums_32*/                                                            \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 3;                                                 \
      ref += ref_stride << 3;                                                 \
      sec += 32 << 3;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 32x64: defines vpx_highbd_sad32x64_avg_avx2()
HIGHBD_SAD32XN_AVG(64)

// 32x32: defines vpx_highbd_sad32x32_avg_avx2()
HIGHBD_SAD32XN_AVG(32)

// 32x16: defines vpx_highbd_sad32x16_avg_avx2()
HIGHBD_SAD32XN_AVG(16)

/* Accumulates |avg(ref, sec) - src| for `height` rows of a 16-sample-wide
 * block into the sixteen 16-bit lanes of *sums_16. The average is the rounded
 * unsigned 16-bit average computed by _mm256_avg_epu16.
 *
 * Two rows are consumed per loop iteration, so `height` is expected to be
 * even; an odd value would process height + 1 rows.
 *
 * sums_16:    in/out accumulator; lane additions wrap at 16 bits, so the
 *             caller must limit `height` so that no lane can overflow.
 * src:        source rows, read with aligned 32-byte loads.
 * src_stride: distance between source rows, in uint16_t elements.
 * ref:        reference rows, read with unaligned loads; never written.
 * ref_stride: distance between reference rows, in uint16_t elements.
 * sec:        second prediction, read as contiguous rows of 16 samples;
 *             never written.
 * height:     number of rows to process.
 */
static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride,
                                                const uint16_t *ref,
                                                int ref_stride,
                                                const uint16_t *sec,
                                                int height) {
  __m256i acc = *sums_16;
  int row;
  for (row = 0; row < height; row += 2) {
    // first row of the pair
    const __m256i s_top = _mm256_load_si256((const __m256i *)src);
    const __m256i p_top =
        _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)ref),
                         _mm256_loadu_si256((const __m256i *)sec));
    const __m256i d_top = _mm256_abs_epi16(_mm256_sub_epi16(p_top, s_top));
    // second row of the pair
    const __m256i s_bot =
        _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i p_bot = _mm256_avg_epu16(
        _mm256_loadu_si256((const __m256i *)(ref + ref_stride)),
        _mm256_loadu_si256((const __m256i *)(sec + 16)));
    const __m256i d_bot = _mm256_abs_epi16(_mm256_sub_epi16(p_bot, s_bot));

    // Each 16-bit lane gains one absolute difference per row.
    acc = _mm256_add_epi16(acc, d_top);
    acc = _mm256_add_epi16(acc, d_bot);

    src += src_stride << 1;
    ref += ref_stride << 1;
    sec += 32;
  }
  *sums_16 = acc;
}

/* Returns the SAD between the 16x32 source block and the rounded average of
 * the reference block and the second prediction. The block is processed as
 * two 16-row halves; each half is accumulated in 16-bit lanes and then
 * widened to 32 bits. Strides are in uint16_t elements; second_pred is read
 * as contiguous rows of 16 samples. */
unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i total_32 = _mm256_setzero_si256();
  int half;

  for (half = 0; half < 2; ++half) {
    __m256i half_16 = _mm256_setzero_si256();
    __m256i lo_32, hi_32;

    highbd_sad16xH_avg(&half_16, src, src_stride, ref, ref_stride, sec, 16);

    // The 16-bit lanes are only relied upon for 16 rows; widen them to
    // 32 bits (zero-extended) before adding to the running total.
    lo_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(half_16));
    hi_32 = _mm256_cvtepu16_epi32(_mm256_extractf128_si256(half_16, 1));
    total_32 = _mm256_add_epi32(total_32, _mm256_add_epi32(lo_32, hi_32));

    src += src_stride << 4;
    ref += ref_stride << 4;
    sec += 16 << 4;
  }
  return calc_final(total_32);
}

/* Returns the SAD between the 16x16 source block and the rounded average of
 * the reference block and the second prediction. All 16 rows are accumulated
 * in 16-bit lanes in a single pass and widened to 32 bits only for the final
 * reduction. Strides are in uint16_t elements. */
unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  __m256i acc_16 = _mm256_setzero_si256();
  __m128i lo_half, hi_half;

  highbd_sad16xH_avg(&acc_16, CONVERT_TO_SHORTPTR(src_ptr), src_stride,
                     CONVERT_TO_SHORTPTR(ref_ptr), ref_stride,
                     CONVERT_TO_SHORTPTR(second_pred), 16);

  // Zero-extend both 128-bit halves to 32-bit lanes, add them, then reduce.
  lo_half = _mm256_castsi256_si128(acc_16);
  hi_half = _mm256_extractf128_si256(acc_16, 1);
  return calc_final(_mm256_add_epi32(_mm256_cvtepu16_epi32(lo_half),
                                     _mm256_cvtepu16_epi32(hi_half)));
}

/* Returns the SAD between the 16x8 source block and the rounded average of
 * the reference block and the second prediction. All 8 rows are accumulated
 * in 16-bit lanes in a single pass and widened to 32 bits only for the final
 * reduction. Strides are in uint16_t elements. */
unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         const uint8_t *second_pred) {
  __m256i acc_16 = _mm256_setzero_si256();
  __m128i lo_half, hi_half;

  highbd_sad16xH_avg(&acc_16, CONVERT_TO_SHORTPTR(src_ptr), src_stride,
                     CONVERT_TO_SHORTPTR(ref_ptr), ref_stride,
                     CONVERT_TO_SHORTPTR(second_pred), 8);

  // Zero-extend both 128-bit halves to 32-bit lanes, add them, then reduce.
  lo_half = _mm256_castsi256_si128(acc_16);
  hi_half = _mm256_extractf128_si256(acc_16, 1);
  return calc_final(_mm256_add_epi32(_mm256_cvtepu16_epi32(lo_half),
                                     _mm256_cvtepu16_epi32(hi_half)));
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.10 Sekunden (vorverarbeitet am 2026-06-11) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.