// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/jxl/base/status.h"
#ifndef FJXL_SELF_INCLUDE
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <array>
#include <limits>
#include <memory>
#include <vector>
#include "lib/jxl/enc_fast_lossless.h"
#if FJXL_STANDALONE
#if defined(_MSC_VER)
using ssize_t = intptr_t;
#endif
#else // FJXL_STANDALONE
#include "lib/jxl/encode_internal.h"
#endif // FJXL_STANDALONE
#if defined(__x86_64__) || defined(_M_X64)
#define FJXL_ARCH_IS_X86_64 1
#else
#define FJXL_ARCH_IS_X86_64 0
#endif
#if defined(__i386__) || defined(_M_IX86) || FJXL_ARCH_IS_X86_64
#define FJXL_ARCH_IS_X86 1
#else
#define FJXL_ARCH_IS_X86 0
#endif
#if FJXL_ARCH_IS_X86
#if defined(_MSC_VER)
#include <intrin.h>
#else // _MSC_VER
#include <cpuid.h>
#endif // _MSC_VER
#endif // FJXL_ARCH_IS_X86
// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compiler
// supports them.
#if defined(__aarch64__) || defined(_M_ARM64) // ARCH
#include <arm_neon.h>
#if !defined(FJXL_ENABLE_NEON)
#define FJXL_ENABLE_NEON 1
#endif // !defined(FJXL_ENABLE_NEON)
#elif FJXL_ARCH_IS_X86_64 && !defined(_MSC_VER) // ARCH
#include <immintrin.h>
// manually add _mm512_cvtsi512_si32 definition if missing
// (e.g. with Xcode on macOS Mojave)
// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
#if defined(__clang__) && \
    ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
     (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_cvtsi512_si32(__m512i __A) {
__v16si __B = (__v16si)__A;
return __B[0];
}
#endif
#if !defined(FJXL_ENABLE_AVX2)
#define FJXL_ENABLE_AVX2 1
#endif // !defined(FJXL_ENABLE_AVX2)
#if !defined(FJXL_ENABLE_AVX512)
// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
#if (defined(__clang__) && \
     (!defined(__apple_build_version__) && __clang_major__ > 7) || \
     (defined(__apple_build_version__) && \
      __apple_build_version__ > 10010046)) || \
    (defined(__GNUC__) && __GNUC__ > 10)
#define FJXL_ENABLE_AVX512 1
#endif
#endif // !defined(FJXL_ENABLE_AVX512)
#endif // ARCH
#ifndef FJXL_ENABLE_NEON
#define FJXL_ENABLE_NEON 0
#endif
#ifndef FJXL_ENABLE_AVX2
#define FJXL_ENABLE_AVX2 0
#endif
#ifndef FJXL_ENABLE_AVX512
#define FJXL_ENABLE_AVX512 0
#endif
namespace {
enum class CpuFeature : uint32_t {
kAVX2 = 0,
kAVX512F,
kAVX512VL,
kAVX512CD,
kAVX512BW,
kVBMI,
kVBMI2
};
constexpr uint32_t CpuFeatureBit(CpuFeature feature) {
  return 1u << static_cast<uint32_t>(feature);
}
#if FJXL_ARCH_IS_X86
#if defined(_MSC_VER)
void Cpuid(const uint32_t level, const uint32_t count,
std::array<uint32_t, 4>& abcd) {
int regs[4];
__cpuidex(regs, level, count);
for (int i = 0; i < 4; ++i) {
abcd[i] = regs[i];
}
}
uint32_t ReadXCR0() { return static_cast<uint32_t>(_xgetbv(0)); }
#else // _MSC_VER
void Cpuid(const uint32_t level, const uint32_t count,
std::array<uint32_t, 4>& abcd) {
uint32_t a;
uint32_t b;
uint32_t c;
uint32_t d;
__cpuid_count(level, count, a, b, c, d);
abcd[0] = a;
abcd[1] = b;
abcd[2] = c;
abcd[3] = d;
}
uint32_t ReadXCR0() {
uint32_t xcr0;
uint32_t xcr0_high;
const uint32_t index = 0;
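  // Raw encoding of the `xgetbv` instruction, to avoid relying on
  // compiler/header support for the intrinsic.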
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(xcr0), "=d"(xcr0_high)
               : "c"(index));
return xcr0;
}
#endif // _MSC_VER
uint32_t DetectCpuFeatures() {
uint32_t flags = 0; // return value
std::array<uint32_t, 4> abcd;
Cpuid(0, 0, abcd);
const uint32_t max_level = abcd[0];
const auto check_bit = [](uint32_t v, uint32_t idx) -> bool {
return (v & (1U << idx)) != 0;
};
// Extended features
if (max_level >= 7) {
Cpuid(7, 0, abcd);
flags |= check_bit(abcd[1], 5) ? CpuFeatureBit(CpuFeature::kAVX2) : 0;
flags |= check_bit(abcd[1], 16) ? CpuFeatureBit(CpuFeature::kAVX512F) : 0;
flags |= check_bit(abcd[1], 28) ? CpuFeatureBit(CpuFeature::kAVX512CD) : 0;
flags |= check_bit(abcd[1], 30) ? CpuFeatureBit(CpuFeature::kAVX512BW) : 0;
flags |= check_bit(abcd[1], 31) ? CpuFeatureBit(CpuFeature::kAVX512VL) : 0;
flags |= check_bit(abcd[2], 1) ? CpuFeatureBit(CpuFeature::kVBMI) : 0;
flags |= check_bit(abcd[2], 6) ? CpuFeatureBit(CpuFeature::kVBMI2) : 0;
}
Cpuid(1, 0, abcd);
const bool os_has_xsave = check_bit(abcd[2], 27);
if (os_has_xsave) {
const uint32_t xcr0 = ReadXCR0();
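    // XCR0 bits: 1 = XMM, 2 = YMM, 5..7 = opmask/ZMM state; all of them must
    // be enabled by the OS for AVX2/AVX-512 to be usable.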
if (!check_bit(xcr0, 1) || !check_bit(xcr0, 2) || !check_bit(xcr0, 5) ||
!check_bit(xcr0, 6) || !check_bit(xcr0, 7)) {
flags = 0; // TODO(eustas): be more selective?
}
}
return flags;
}
#else // FJXL_ARCH_IS_X86
uint32_t DetectCpuFeatures() { return 0; }
#endif // FJXL_ARCH_IS_X86
#if defined(_MSC_VER)
#define FJXL_UNUSED
#else
#define FJXL_UNUSED __attribute__((unused))
#endif
FJXL_UNUSED bool HasCpuFeature(CpuFeature feature) {
static uint32_t cpu_features = DetectCpuFeatures();
return (cpu_features & CpuFeatureBit(feature)) != 0;
}
#if defined(_MSC_VER) && !defined(__clang__)
#define FJXL_INLINE __forceinline
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
unsigned long index;
_BitScanReverse(&index, v);
return index;
}
FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
unsigned long index;
_BitScanForward(&index, v);
return index;
}
#else
#define FJXL_INLINE inline __attribute__((always_inline))
FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
return v ? 31 - __builtin_clz(v) : 0;
}
FJXL_UNUSED FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
return __builtin_ctzll(v);
}
#endif
// Compiles to a memcpy on little-endian systems.
FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
for (int i = 0; i < 8; i++) {
tgt[i] = (data >> (i * 8)) & 0xFF;
}
#else
memcpy(tgt, &data, 8);
#endif
}
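// Appends `count` (at most 56) bits of `bits` to the pending bit buffer,
// stores the completed bytes to `data_buf`, and returns how many whole bytes
// were flushed; leftover bits stay in `bit_buffer`/`bits_in_buffer`.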
FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
size_t& bits_in_buffer, uint64_t& bit_buffer) {
bit_buffer |= bits << bits_in_buffer;
bits_in_buffer += count;
StoreLE64(data_buf, bit_buffer);
size_t bytes_in_buffer = bits_in_buffer / 8;
bits_in_buffer -= bytes_in_buffer * 8;
bit_buffer >>= bytes_in_buffer * 8;
return bytes_in_buffer;
}
struct BitWriter {
void Allocate(size_t maximum_bit_size) {
assert(data == nullptr);
// Leave some padding.
    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
}
void Write(uint32_t count, uint64_t bits) {
bytes_written += AddBits(count, bits, data.get() + bytes_written,
bits_in_buffer, buffer);
}
void ZeroPadToByte() {
if (bits_in_buffer != 0) {
Write(8 - bits_in_buffer, 0);
}
}
FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
size_t n) {
// Necessary because Write() is only guaranteed to work with <=56 bits.
// Trying to SIMD-fy this code results in lower speed (and definitely less
// clarity).
{
for (size_t i = 0; i < n; i++) {
this->buffer |= bits[i] << this->bits_in_buffer;
memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
uint64_t shift = 64 - this->bits_in_buffer;
this->bits_in_buffer += nbits[i];
// This `if` seems to be faster than using ternaries.
if (this->bits_in_buffer >= 64) {
uint64_t next_buffer = bits[i] >> shift;
this->buffer = next_buffer;
this->bits_in_buffer -= 64;
this->bytes_written += 8;
}
}
memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
size_t bytes_in_buffer = this->bits_in_buffer / 8;
this->bits_in_buffer -= bytes_in_buffer * 8;
this->buffer >>= bytes_in_buffer * 8;
this->bytes_written += bytes_in_buffer;
}
}
  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
size_t bytes_written = 0;
size_t bits_in_buffer = 0;
uint64_t buffer = 0;
};
size_t SectionSize(const std::array<BitWriter, 4>& group_data) {
size_t sz = 0;
for (size_t j = 0; j < 4; j++) {
    const auto& writer = group_data[j];
sz += writer.bytes_written * 8 + writer.bits_in_buffer;
}
sz = (sz + 7) / 8;
return sz;
}
constexpr size_t kMaxFrameHeaderSize = 5;
constexpr size_t kGroupSizeOffset[4] = {
    static_cast<size_t>(0),
    static_cast<size_t>(1024),
    static_cast<size_t>(17408),
    static_cast<size_t>(4211712),
};
constexpr size_t kTOCBits[4] = {12, 16, 24, 32};
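// A TOC entry is a 2-bit bucket selector followed by a 10/14/22/30-bit
// payload (12/16/24/32 bits total); kGroupSizeOffset[i] is the combined
// capacity of the smaller buckets, so the payload holds
// group_size - kGroupSizeOffset[bucket].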
size_t TOCBucket(size_t group_size) {
size_t bucket = 0;
while (bucket < 3 && group_size >= kGroupSizeOffset[bucket + 1]) ++bucket;
return bucket;
}
#if !FJXL_STANDALONE
size_t TOCSize(const std::vector<size_t>& group_sizes) {
size_t toc_bits = 0;
for (size_t group_size : group_sizes) {
toc_bits += kTOCBits[TOCBucket(group_size)];
}
return (toc_bits + 7) / 8;
}
size_t FrameHeaderSize(bool have_alpha, bool is_last) {
size_t nbits = 28 + (have_alpha ? 4 : 0) + (is_last ? 0 : 2);
return (nbits + 7) / 8;
}
#endif
void ComputeAcGroupDataOffset(size_t dc_global_size, size_t num_dc_groups,
size_t num_ac_groups, size_t& min_dc_global_size,
size_t& ac_group_offset) {
// Max AC group size is 768 kB, so max AC group TOC bits is 24.
size_t ac_toc_max_bits = num_ac_groups * 24;
size_t ac_toc_min_bits = num_ac_groups * 12;
size_t max_padding = 1 + (ac_toc_max_bits - ac_toc_min_bits + 7) / 8;
min_dc_global_size = dc_global_size;
size_t dc_global_bucket = TOCBucket(min_dc_global_size);
while (TOCBucket(min_dc_global_size + max_padding) > dc_global_bucket) {
dc_global_bucket = TOCBucket(min_dc_global_size + max_padding);
min_dc_global_size = kGroupSizeOffset[dc_global_bucket];
}
assert(TOCBucket(min_dc_global_size) == dc_global_bucket);
assert(TOCBucket(min_dc_global_size + max_padding) == dc_global_bucket);
size_t max_toc_bits =
kTOCBits[dc_global_bucket] + 12 * (1 + num_dc_groups) + ac_toc_max_bits;
size_t max_toc_size = (max_toc_bits + 7) / 8;
ac_group_offset = kMaxFrameHeaderSize + max_toc_size + min_dc_global_size;
}
#if !FJXL_STANDALONE
size_t ComputeDcGlobalPadding(const std::vector<size_t>& group_sizes,
size_t ac_group_data_offset,
size_t min_dc_global_size, bool have_alpha,
bool is_last) {
std::vector<size_t> new_group_sizes = group_sizes;
new_group_sizes[0] = min_dc_global_size;
size_t toc_size = TOCSize(new_group_sizes);
size_t actual_offset =
FrameHeaderSize(have_alpha, is_last) + toc_size + group_sizes[0];
return ac_group_data_offset - actual_offset;
}
#endif
constexpr size_t kNumRawSymbols = 19;
constexpr size_t kNumLZ77 = 33;
constexpr size_t kLZ77CacheSize = 32;
constexpr size_t kLZ77Offset = 224;
constexpr size_t kLZ77MinLength = 7;
void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
uint32_t* bits) {
  // Hybrid-uint "400" config: values below 16 are the token itself; larger
  // values store 16 + FloorLog2(value) - 4 in the token, followed by the
  // FloorLog2(value) bits below the leading one. E.g. value 100: token
  // 16 + 6 - 4 = 18, plus 6 extra bits holding 100 - 64 = 36.
uint32_t n = FloorLog2(value);
*token = value < 16 ? value : 16 + n - 4;
*nbits = value < 16 ? 0 : n;
*bits = value < 16 ? 0 : value - (1 << *nbits);
}
struct PrefixCode {
uint8_t raw_nbits[kNumRawSymbols] = {};
uint8_t raw_bits[kNumRawSymbols] = {};
uint8_t lz77_nbits[kNumLZ77] = {};
uint16_t lz77_bits[kNumLZ77] = {};
uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
size_t numraw;
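  // Reverses the low `nbits` bits of `bits`: canonical codes are assigned
  // MSB-first, but the bitstream is written LSB-first.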
static uint16_t BitReverse(size_t nbits, uint16_t bits) {
constexpr uint16_t kNibbleLookup[16] = {
0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
};
uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
(kNibbleLookup[(bits >> 4) & 0xF] << 8) |
(kNibbleLookup[(bits >> 8) & 0xF] << 4) |
(kNibbleLookup[bits >> 12]);
return rev16 >> (16 - nbits);
}
// Create the prefix codes given the code lengths.
// Supports the code lengths being split into two halves.
static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
uint8_t* first_chunk_bits,
size_t first_chunk_size,
const uint8_t* second_chunk_nbits,
uint16_t* second_chunk_bits,
size_t second_chunk_size) {
constexpr size_t kMaxCodeLength = 15;
uint8_t code_length_counts[kMaxCodeLength + 1] = {};
for (size_t i = 0; i < first_chunk_size; i++) {
code_length_counts[first_chunk_nbits[i]]++;
assert(first_chunk_nbits[i] <= kMaxCodeLength);
assert(first_chunk_nbits[i] <= 8);
assert(first_chunk_nbits[i] > 0);
}
for (size_t i = 0; i < second_chunk_size; i++) {
code_length_counts[second_chunk_nbits[i]]++;
assert(second_chunk_nbits[i] <= kMaxCodeLength);
}
uint16_t next_code[kMaxCodeLength + 1] = {};
uint16_t code = 0;
for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
code = (code + code_length_counts[i - 1]) << 1;
next_code[i] = code;
}
for (size_t i = 0; i < first_chunk_size; i++) {
first_chunk_bits[i] =
BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
}
for (size_t i = 0; i < second_chunk_size; i++) {
second_chunk_bits[i] =
BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
}
}
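  // Dynamic program over the Kraft-inequality budget: d(sym, off) is the
  // minimal total cost (sum of freqs[i] * nbits[i]) over the first `sym`
  // symbols when their codewords use up `off` out of 2^precision budget
  // units, a length-b code costing 2^(precision - b) units. The backward
  // pass then recovers the chosen lengths from d(n, 2^precision).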
template <typename T>
static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
size_t precision, T infty,
const uint8_t* min_limit,
const uint8_t* max_limit,
uint8_t* nbits) {
assert(precision < 15);
assert(n <= kMaxNumSymbols);
std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
auto d = [&](size_t sym, size_t off) -> T& {
return dynp[sym * ((1 << precision) + 1) + off];
};
d(0, 0) = 0;
for (size_t sym = 0; sym < n; sym++) {
for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
size_t off_delta = 1U << (precision - bits);
for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
d(sym + 1, off + off_delta) =
              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
d(sym + 1, off + off_delta));
}
}
}
size_t sym = n;
size_t off = 1U << precision;
assert(d(sym, off) != infty);
while (sym-- > 0) {
assert(off > 0);
for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
size_t off_delta = 1U << (precision - bits);
if (off_delta <= off &&
d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
off -= off_delta;
nbits[sym] = bits;
          break;
}
}
}
}
  // Computes nbits[i] for i < n, subject to min_limit[i] <= nbits[i] <=
  // max_limit[i] and sum 2**-nbits[i] == 1, so as to minimize
  // sum(nbits[i] * freqs[i]).
static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
uint8_t* min_limit, uint8_t* max_limit,
uint8_t* nbits) {
size_t precision = 0;
size_t shortest_length = 255;
uint64_t freqsum = 0;
for (size_t i = 0; i < n; i++) {
assert(freqs[i] != 0);
freqsum += freqs[i];
if (min_limit[i] < 1) min_limit[i] = 1;
assert(min_limit[i] <= max_limit[i]);
precision = std::max<size_t>(max_limit[i], precision);
shortest_length = std::min<size_t>(min_limit[i], shortest_length);
}
// If all the minimum limits are greater than 1, shift precision so that we
// behave as if the shortest was 1.
precision -= shortest_length - 1;
uint64_t infty = freqsum * precision;
if (infty < std::numeric_limits<uint32_t>::max() / 2) {
ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
                                    static_cast<uint32_t>(infty), min_limit,
max_limit, nbits);
} else {
ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
max_limit, nbits);
}
}
static constexpr size_t kMaxNumSymbols =
kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
const uint8_t* min_limit_in,
const uint8_t* max_limit_in, uint8_t* nbits) {
assert(n <= kMaxNumSymbols);
uint64_t compact_freqs[kMaxNumSymbols];
uint8_t min_limit[kMaxNumSymbols];
uint8_t max_limit[kMaxNumSymbols];
size_t ni = 0;
for (size_t i = 0; i < n; i++) {
if (freqs[i]) {
compact_freqs[ni] = freqs[i];
min_limit[ni] = min_limit_in[i];
max_limit[ni] = max_limit_in[i];
ni++;
}
}
uint8_t num_bits[kMaxNumSymbols] = {};
ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
num_bits);
ni = 0;
for (size_t i = 0; i < n; i++) {
nbits[i] = 0;
if (freqs[i]) {
nbits[i] = num_bits[ni++];
}
}
}
// Invalid code, used to construct arrays.
  PrefixCode() = default;
template <typename BitDepth>
PrefixCode(BitDepth /* bitdepth */, uint64_t* raw_counts,
uint64_t* lz77_counts) {
// "merge" together all the lz77 counts in a single symbol for the level 1
// table (containing just the raw symbols, up to length 7).
uint64_t level1_counts[kNumRawSymbols + 1];
    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
numraw = kNumRawSymbols;
while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
level1_counts[numraw] = 0;
for (size_t i = 0; i < kNumLZ77; i++) {
level1_counts[numraw] += lz77_counts[i];
}
uint8_t level1_nbits[kNumRawSymbols + 1] = {};
ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
BitDepth::kMaxRawLength, level1_nbits);
uint8_t level2_nbits[kNumLZ77] = {};
uint8_t min_lengths[kNumLZ77] = {};
uint8_t l = 15 - level1_nbits[numraw];
uint8_t max_lengths[kNumLZ77];
for (uint8_t& max_length : max_lengths) {
max_length = l;
}
size_t num_lz77 = kNumLZ77;
while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
level2_nbits);
for (size_t i = 0; i < numraw; i++) {
raw_nbits[i] = level1_nbits[i];
}
for (size_t i = 0; i < num_lz77; i++) {
lz77_nbits[i] =
level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
}
ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
kNumLZ77);
// Prepare lz77 cache
for (size_t count = 0; count < kLZ77CacheSize; count++) {
      uint32_t token, nbits, bits;
EncodeHybridUintLZ77(count, &token, &nbits, &bits);
lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
lz77_cache_bits[count] =
(((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
raw_bits[0];
}
}
  // Max bits written: 2 + 72 + 95 + 24 + 165 = 358
void WriteTo(BitWriter* writer) const {
uint64_t code_length_counts[18] = {};
code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
for (uint8_t raw_nbit : raw_nbits) {
code_length_counts[raw_nbit]++;
}
for (uint8_t lz77_nbit : lz77_nbits) {
code_length_counts[lz77_nbit]++;
}
uint8_t code_length_nbits[18] = {};
uint8_t code_length_nbits_min[18] = {};
uint8_t code_length_nbits_max[18] = {
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
};
ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
code_length_nbits_max, code_length_nbits);
writer->Write(2, 0b00); // HSKIP = 0, i.e. don't skip code lengths.
// As per Brotli RFC.
uint8_t code_length_order[18] = {1, 2, 3, 4, 0, 5, 17, 6, 16,
7, 8, 9, 10, 11, 12, 13, 14, 15};
uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
// Encode lengths of code lengths.
size_t num_code_lengths = 18;
while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
num_code_lengths--;
}
// Max bits written in this loop: 18 * 4 = 72
for (size_t i = 0; i < num_code_lengths; i++) {
int symbol = code_length_nbits[code_length_order[i]];
writer->Write(code_length_length_nbits[symbol],
code_length_length_bits[symbol]);
}
// Compute the canonical codes for the codes that represent the lengths of
// the actual codes for data.
uint16_t code_length_bits[18] = {};
ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
code_length_bits, 18);
// Encode raw bit code lengths.
// Max bits written in this loop: 19 * 5 = 95
for (uint8_t raw_nbit : raw_nbits) {
writer->Write(code_length_nbits[raw_nbit], code_length_bits[raw_nbit]);
}
size_t num_lz77 = kNumLZ77;
while (lz77_nbits[num_lz77 - 1] == 0) {
num_lz77--;
}
// Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
// 205.
static_assert(kLZ77Offset == 224);
static_assert(kNumRawSymbols == 19);
{
// Max bits in this block: 24
writer->Write(code_length_nbits[17], code_length_bits[17]);
writer->Write(3, 0b010); // 5
writer->Write(code_length_nbits[17], code_length_bits[17]);
writer->Write(3, 0b000); // (5-2)*8 + 3 = 27
writer->Write(code_length_nbits[17], code_length_bits[17]);
writer->Write(3, 0b010); // (27-2)*8 + 5 = 205
}
// Encode LZ77 symbols, with values 224+i.
// Max bits written in this loop: 33 * 5 = 165
for (size_t i = 0; i < num_lz77; i++) {
writer->Write(code_length_nbits[lz77_nbits[i]],
code_length_bits[lz77_nbits[i]]);
}
}
};
} // namespace
extern "C" {
struct JxlFastLosslessFrameState {
JxlChunkedFrameInputSource input;
size_t width;
size_t height;
size_t num_groups_x;
size_t num_groups_y;
size_t num_dc_groups_x;
size_t num_dc_groups_y;
size_t nb_chans;
size_t bitdepth;
int big_endian;
int effort;
bool collided;
PrefixCode hcode[4];
std::vector<int16_t> lookup;
BitWriter header;
std::vector<std::array<BitWriter, 4>> group_data;
std::vector<size_t> group_sizes;
size_t ac_group_data_offset = 0;
size_t min_dc_global_size = 0;
size_t current_bit_writer = 0;
size_t bit_writer_byte_pos = 0;
size_t bits_in_buffer = 0;
uint64_t bit_buffer = 0;
  bool process_done = false;
};
size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
size_t total_size_groups = 0;
  for (const auto& section : frame->group_data) {
total_size_groups += SectionSize(section);
}
return frame->header.bytes_written + total_size_groups;
}
size_t JxlFastLosslessMaxRequiredOutput(
const JxlFastLosslessFrameState* frame) {
return JxlFastLosslessOutputSize(frame) + 32;
}
void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
int add_image_header, int is_last) {
BitWriter* output = &frame->header;
output->Allocate(1000 + frame->group_sizes.size() * 32);
bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);
#if FJXL_STANDALONE
if (add_image_header) {
// Signature
output->Write(16, 0x0AFF);
// Size header, hand-crafted.
// Not small
output->Write(1, 0);
auto wsz = [output](size_t size) {
if (size - 1 < (1 << 9)) {
output->Write(2, 0b00);
output->Write(9, size - 1);
} else if (size - 1 < (1 << 13)) {
output->Write(2, 0b01);
output->Write(13, size - 1);
} else if (size - 1 < (1 << 18)) {
output->Write(2, 0b10);
output->Write(18, size - 1);
} else {
output->Write(2, 0b11);
output->Write(30, size - 1);
}
};
wsz(frame->height);
// No special ratio.
output->Write(3, 0);
wsz(frame->width);
// Hand-crafted ImageMetadata.
output->Write(1, 0); // all_default
output->Write(1, 0); // extra_fields
output->Write(1, 0); // bit_depth.floating_point_sample
if (frame->bitdepth == 8) {
output->Write(2, 0b00); // bit_depth.bits_per_sample = 8
} else if (frame->bitdepth == 10) {
output->Write(2, 0b01); // bit_depth.bits_per_sample = 10
} else if (frame->bitdepth == 12) {
output->Write(2, 0b10); // bit_depth.bits_per_sample = 12
} else {
output->Write(2, 0b11); // 1 + u(6)
output->Write(6, frame->bitdepth - 1);
}
if (frame->bitdepth <= 14) {
output->Write(1, 1); // 16-bit-buffer sufficient
} else {
output->Write(1, 0); // 16-bit-buffer NOT sufficient
}
if (have_alpha) {
output->Write(2, 0b01); // One extra channel
      output->Write(1, 1); // ... all_default (i.e. 8-bit alpha)
} else {
output->Write(2, 0b00); // No extra channel
}
output->Write(1, 0); // Not XYB
if (frame->nb_chans > 2) {
output->Write(1, 1); // color_encoding.all_default (sRGB)
} else {
output->Write(1, 0); // color_encoding.all_default false
output->Write(1, 0); // color_encoding.want_icc false
output->Write(2, 1); // grayscale
output->Write(2, 1); // D65
output->Write(1, 0); // no gamma transfer function
output->Write(2, 0b10); // tf: 2 + u(4)
output->Write(4, 11); // tf of sRGB
output->Write(2, 1); // relative rendering intent
}
output->Write(2, 0b00); // No extensions.
output->Write(1, 1); // all_default transform data
// No ICC, no preview. Frame should start at byte boundary.
output->ZeroPadToByte();
}
#else
assert(!add_image_header);
#endif
// Handcrafted frame header.
output->Write(1, 0); // all_default
output->Write(2, 0b00); // regular frame
output->Write(1, 1); // modular
output->Write(2, 0b00); // default flags
output->Write(1, 0); // not YCbCr
output->Write(2, 0b00); // no upsampling
if (have_alpha) {
output->Write(2, 0b00); // no alpha upsampling
}
output->Write(2, 0b01); // default group size
output->Write(2, 0b00); // exactly one pass
output->Write(1, 0); // no custom size or origin
output->Write(2, 0b00); // kReplace blending mode
if (have_alpha) {
output->Write(2, 0b00); // kReplace blending mode for alpha channel
}
output->Write(1, is_last); // is_last
if (!is_last) {
    output->Write(2, 0b00); // cannot be saved as reference
}
output->Write(2, 0b00); // a frame has no name
output->Write(1, 0); // loop filter is not all_default
output->Write(1, 0); // no gaborish
output->Write(2, 0); // 0 EPF iters
output->Write(2, 0b00); // No LF extensions
output->Write(2, 0b00); // No FH extensions
output->Write(1, 0); // No TOC permutation
output->ZeroPadToByte(); // TOC is byte-aligned.
assert(add_image_header || output->bytes_written <= kMaxFrameHeaderSize);
for (size_t group_size : frame->group_sizes) {
size_t bucket = TOCBucket(group_size);
output->Write(2, bucket);
output->Write(kTOCBits[bucket] - 2, group_size - kGroupSizeOffset[bucket]);
}
output->ZeroPadToByte(); // Groups are byte-aligned.
}
#if !FJXL_STANDALONE
bool JxlFastLosslessOutputAlignedSection(
const BitWriter& bw, JxlEncoderOutputProcessorWrapper* output_processor) {
assert(bw.bits_in_buffer == 0);
const uint8_t* data = bw.data.get();
size_t remaining_len = bw.bytes_written;
while (remaining_len > 0) {
JXL_ASSIGN_OR_RETURN(auto buffer,
output_processor->GetBuffer(1, remaining_len));
size_t n = std::min(buffer.size(), remaining_len);
    if (n == 0) break;
    memcpy(buffer.data(), data, n);
    JXL_RETURN_IF_ERROR(buffer.advance(n));
    data += n;
    remaining_len -= n;
  }
  return true;
}
bool JxlFastLosslessOutputHeaders(
JxlFastLosslessFrameState* frame_state,
JxlEncoderOutputProcessorWrapper* output_processor) {
JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(frame_state->header,
output_processor));
JXL_RETURN_IF_ERROR(JxlFastLosslessOutputAlignedSection(
frame_state->group_data[0][0], output_processor));
return true ;
}
#endif
#if FJXL_ENABLE_AVX512
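// Bulk version of the bit-offset copy in JxlFastLosslessWriteOutput: shifts
// the input stream up by `bit_buffer_nbits` bits while merging in the pending
// bits, 64 bytes per iteration via the VBMI2 funnel shift. Handles only whole
// 64-byte blocks; returns the number of bytes consumed and leaves the new
// pending bits in `bit_buffer`.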
__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
    unsigned char* output, uint64_t& bit_buffer) {
if (n < 128) {
return 0;
}
size_t i = 0;
__m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
__m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));
for (; i + 64 <= n; i += 64) {
__m512i current = _mm512_loadu_si512(data + i);
__m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
carry = current;
__m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
_mm512_storeu_si512(output + i, out);
}
bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);
return i;
}
#endif
size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
                                  unsigned char* output, size_t output_size) {
assert(output_size >= 32);
  unsigned char* initial_output = output;
  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
                                         unsigned char*, uint64_t&) = nullptr;
#if FJXL_ENABLE_AVX512
if (HasCpuFeature(CpuFeature::kVBMI2)) {
append_bytes_with_bit_offset = AppendBytesWithBitOffset;
}
#endif
  while (true) {
size_t& cur = frame->current_bit_writer;
size_t& bw_pos = frame->bit_writer_byte_pos;
if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
return output - initial_output;
}
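    // AddBits (via StoreLE64) writes 8 bytes at a time below, so always keep
    // some slack at the end of the output buffer.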
if (output_size <= 9) {
return output - initial_output;
}
size_t nbc = frame->nb_chans;
const BitWriter& writer =
cur == 0 ? frame->header
: frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
size_t full_byte_count =
std::min(output_size - 9, writer.bytes_written - bw_pos);
if (frame->bits_in_buffer == 0) {
memcpy(output, writer.data.get() + bw_pos, full_byte_count);
} else {
size_t i = 0;
if (append_bytes_with_bit_offset) {
i += append_bytes_with_bit_offset(
writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
output, frame->bit_buffer);
}
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
// Copy 8 bytes at a time until we reach the border.
for (; i + 8 < full_byte_count; i += 8) {
uint64_t chunk;
memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
memcpy(output + i, &out, 8);
frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
}
#endif
for (; i < full_byte_count; i++) {
AddBits(8, writer.data.get()[bw_pos + i], output + i,
frame->bits_in_buffer, frame->bit_buffer);
}
}
output += full_byte_count;
output_size -= full_byte_count;
bw_pos += full_byte_count;
if (bw_pos == writer.bytes_written) {
auto write = [&](size_t num, uint64_t bits) {
size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
frame->bit_buffer);
output += n;
output_size -= n;
};
if (writer.bits_in_buffer) {
write(writer.bits_in_buffer, writer.buffer);
}
bw_pos = 0;
cur++;
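      // Sections (the header and each group) are byte-aligned in the output,
      // while the per-channel writers within a group are bit-packed; pad when
      // the writer that just finished closes a section.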
if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
write(8 - frame->bits_in_buffer, 0);
}
}
}
}
void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
delete frame;
}
} // extern "C"
#endif
#ifdef FJXL_SELF_INCLUDE
namespace {
template <typename T>
struct VecPair {
T low;
T hi;
};
#ifdef FJXL_GENERIC_SIMD
#undef FJXL_GENERIC_SIMD
#endif
#ifdef FJXL_AVX512
#define FJXL_GENERIC_SIMD
struct SIMDVec32;
struct Mask32 {
__mmask16 mask;
SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
size_t CountPrefix() const {
return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
}
};
struct SIMDVec32 {
__m512i vec;
static constexpr size_t kLanes = 16;
FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
}
FJXL_INLINE void Store(uint32_t* data) {
_mm512_storeu_si512((__m512i*)data, vec);
}
FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
return SIMDVec32{_mm512_set1_epi32(v)};
}
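  // Token of v is FloorLog2(v) + 1, i.e. 32 - lzcnt(v); v == 0 gives 0.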
FJXL_INLINE SIMDVec32 ValToToken() const {
return SIMDVec32{
_mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
}
FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
to_subtract.vec)};
}
FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
}
FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
}
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
}
FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
}
FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
}
FJXL_INLINE SIMDVec32 Pow2() const {
return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
}
template <size_t i>
FJXL_INLINE SIMDVec32 SignedShiftRight() const {
return SIMDVec32{_mm512_srai_epi32(vec, i)};
}
};
struct SIMDVec16;
struct Mask16 {
__mmask32 mask;
SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  Mask16 And(const Mask16& oth) const {
return Mask16{_kand_mask32(mask, oth.mask)};
}
size_t CountPrefix() const {
return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
}
};
struct SIMDVec16 {
__m512i vec;
static constexpr size_t kLanes = 32;
FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
}
FJXL_INLINE void Store(uint16_t* data) {
_mm512_storeu_si512((__m512i*)data, vec);
}
FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
return SIMDVec16{_mm512_set1_epi16(v)};
}
FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
const SIMDVec32& hi) {
auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
return SIMDVec16{
_mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
}
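  // Tokens for both 16-bit halves of each 32-bit lane: the high half's token
  // comes from lzcnt of the full lane (clamped to 0 when the high half is
  // zero), the low half's from lzcnt of the masked low 16 bits.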
FJXL_INLINE SIMDVec16 ValToToken() const {
auto c16 = _mm512_set1_epi32(16);
auto c32 = _mm512_set1_epi32(32);
auto low16bit = _mm512_set1_epi32(0x0000FFFF);
auto lzhi =
_mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
auto lzlo = _mm512_sub_epi32(
c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
}
FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
}
FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
}
FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
}
FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
}
FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
}
FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
}
FJXL_INLINE SIMDVec16 Pow2() const {
return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
}
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
}
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
}
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
}
FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
}
FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
}
FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
return SIMDVec16{_mm512_shuffle_epi8(
_mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
}
FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
auto lo = _mm512_unpacklo_epi16(low.vec, vec);
auto hi = _mm512_unpackhi_epi16(low.vec, vec);
alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
return {SIMDVec16{_mm512_permutex2var_epi64(
lo, _mm512_load_si512((__m512i*)perm1), hi)},
SIMDVec16{_mm512_permutex2var_epi64(
lo, _mm512_load_si512((__m512i*)perm2), hi)}};
}
FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
return {SIMDVec32{_mm512_permutex2var_epi64(
lo, _mm512_load_si512((__m512i*)perm1), hi)},
SIMDVec32{_mm512_permutex2var_epi64(
lo, _mm512_load_si512((__m512i*)perm2), hi)}};
}
template <size_t i>
FJXL_INLINE SIMDVec16 SignedShiftRight() const {
return SIMDVec16{_mm512_srai_epi16(vec, i)};
}
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
__m256i bytes = _mm256_loadu_si256((__m256i*)data);
return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
}
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
return {Load((const uint16_t*)data)};
}
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
__m512i bytes = _mm512_loadu_si512((__m512i*)data);
__m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
__m512i alpha = _mm512_srli_epi16(bytes, 8);
return {SIMDVec16{gray}, SIMDVec16{alpha}};
}
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
__m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
__m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
__m512i g_mask = _mm512_set1_epi32(0xFFFF);
__m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
__m512i g = _mm512_permutexvar_epi64(
permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
_mm512_and_si512(bytes2, g_mask)));
__m512i a = _mm512_permutexvar_epi64(
permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
_mm512_srli_epi32(bytes2, 16)));
return {SIMDVec16{g}, SIMDVec16{a}};
}
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
__m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
__m512i bytes1 =
_mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));
// 0x7A = element of upper half of second vector = 0 after lookup; still in
// the upper half once we add 1 or 2.
uint8_t z = 0x7A;
__m512i ridx =
_mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
__m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
__m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
__m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
__m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
__m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
}
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
__m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
__m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
__m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
__m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
24, 21, 18, 15, 12, 9, 6, 3, 0);
// -1 is such that when adding 1 or 2, we get the correct index for
// green/blue.
__m512i ridx_hi =
_mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
__m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
__m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
__m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));
__mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
__mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);
__m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
__m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
__m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
__m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
__m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
__m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
}
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
__m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
__m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
__m512i rg_mask = _mm512_set1_epi32(0xFFFF);
__m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
__m512i rg = _mm512_permutexvar_epi64(
permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
_mm512_and_si512(bytes2, rg_mask)));
__m512i b_a = _mm512_permutexvar_epi64(
permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
_mm512_srli_epi32(bytes2, 16)));
__m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
__m512i g = _mm512_srli_epi16(rg, 8);
__m512i b = _mm512_and_si512(b_a, _mm512_set1_epi16(0xFF));
__m512i a = _mm512_srli_epi16(b_a, 8);
return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
}
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
__m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
__m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
__m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
__m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));
auto pack32 = [](__m512i a, __m512i b) {
__m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
};
auto packlow32 = [&pack32](__m512i a, __m512i b) {
__m512i mask = _mm512_set1_epi32(0xFFFF);
return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
};
auto packhi32 = [&pack32](__m512i a, __m512i b) {
return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
};
__m512i rb0 = packlow32(bytes0, bytes1);
__m512i rb1 = packlow32(bytes2, bytes3);
__m512i ga0 = packhi32(bytes0, bytes1);
__m512i ga1 = packhi32(bytes2, bytes3);
__m512i r = packlow32(rb0, rb1);
__m512i g = packlow32(ga0, ga1);
__m512i b = packhi32(rb0, rb1);
__m512i a = packhi32(ga0, ga1);
return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
}
void SwapEndian() {
auto indices = _mm512_broadcast_i32x4(
_mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
vec = _mm512_shuffle_epi8(vec, indices);
}
};
SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
const SIMDVec16& if_false) {
return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)};
}
SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
const SIMDVec32& if_false) {
return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)};
}
struct Bits64 {
static constexpr size_t kLanes = 8;
__m512i nbits;
__m512i bits;
FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
_mm512_storeu_si512((__m512i*)nbits_out, nbits);
_mm512_storeu_si512((__m512i*)bits_out, bits);
}
};
struct Bits32 {
__m512i nbits;
__m512i bits;
static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
return Bits32{nbits.vec, bits.vec};
}
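  // Merges each pair of adjacent 32-bit (nbits, bits) lanes into one 64-bit
  // pair, placing the odd (high) lane's bits above the even (low) lane's.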
Bits64 Merge() const {
auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
auto bits_hi32 = _mm512_srli_epi64(bits, 32);
auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));
auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
auto bits64 =
_mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
return Bits64{nbits64, bits64};
}
void Interleave(const Bits32& low) {
bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
nbits = _mm512_add_epi32(nbits, low.nbits);
}
void ClipTo(size_t n) {
n = std::min<size_t>(n, 16);
constexpr uint32_t kMask[32] = {
~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
__m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
nbits = _mm512_and_si512(mask, nbits);
bits = _mm512_and_si512(mask, bits);
}
void Skip(size_t n) {
n = std::min<size_t>(n, 16);
constexpr uint32_t kMask[32] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
};
__m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
nbits = _mm512_and_si512(mask, nbits);
bits = _mm512_and_si512(mask, bits);
}
};
struct Bits16 {
__m512i nbits;
__m512i bits;
static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
return Bits16{nbits.vec, bits.vec};
}
Bits32 Merge() const {
auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
auto bits_hi16 = _mm512_srli_epi32(bits, 16);
auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));
auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
auto bits32 =
_mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
return Bits32{nbits32, bits32};
}
void Interleave(const Bits16& low) {
bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
nbits = _mm512_add_epi16(nbits, low.nbits);
}
void ClipTo(size_t n) {
n = std::min<size_t>(n, 32);
constexpr uint16_t kMask[64] = {
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
};
__m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
nbits = _mm512_and_si512(mask, nbits);
bits = _mm512_and_si512(mask, bits);
}
void Skip(size_t n) {
n = std::min<size_t>(n, 32);
constexpr uint16_t kMask[64] = {
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
};
__m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
nbits = _mm512_and_si512(mask, nbits);
bits = _mm512_and_si512(mask, bits);
}
};
#endif
#ifdef FJXL_AVX2
#define FJXL_GENERIC_SIMD
struct SIMDVec32;
struct Mask32 {
__m256i mask;
SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
size_t CountPrefix() const {
    return CtzNonZero(~static_cast<uint64_t>(
        static_cast<uint8_t>(_mm256_movemask_ps(_mm256_castsi256_ps(mask)))));
}
};
struct SIMDVec32 {
__m256i vec;
static constexpr size_t kLanes = 8;
FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
}
FJXL_INLINE void Store(uint32_t* data) {
_mm256_storeu_si256((__m256i*)data, vec);
}
FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
return SIMDVec32{_mm256_set1_epi32(v)};
}
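  // No vector lzcnt in AVX2: convert to float and read back the exponent
  // field, so that (bits >> 23) - 126 == FloorLog2(v) + 1 for v > 0 (integers
  // up to 2^24 convert exactly); the max with 0 handles v == 0.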
FJXL_INLINE SIMDVec32 ValToToken() const {
auto f32 = _mm256_castps_si256(_mm256_cvtepi32_ps(vec));
return SIMDVec32{_mm256_max_epi32(
_mm256_setzero_si256(),
_mm256_sub_epi32(_mm256_srli_epi32(f32, 23), _mm256_set1_epi32(126)))};
}
FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
to_subtract.vec)};
}
FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
}
FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
}
  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
}
FJXL_INLINE SIMDVec32 Pow2() const {
return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
}
FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
}
FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
}
template <size_t i>
FJXL_INLINE SIMDVec32 SignedShiftRight() const {
return SIMDVec32{_mm256_srai_epi32(vec, i)};
}
};
struct SIMDVec16;
struct Mask16 {
__m256i mask;
SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
  Mask16 And(const Mask16& oth) const {
return Mask16{_mm256_and_si256(mask, oth.mask)};
}
size_t CountPrefix() const {
    return CtzNonZero(~static_cast<uint64_t>(
               static_cast<uint32_t>(_mm256_movemask_epi8(mask)))) /
2;
}
};
struct SIMDVec16 {
__m256i vec;
static constexpr size_t kLanes = 16;
FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
}
FJXL_INLINE void Store(uint16_t* data) {
_mm256_storeu_si256((__m256i*)data, vec);
}
FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
return SIMDVec16{_mm256_set1_epi16(v)};
}
FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
const SIMDVec32& hi) {
auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
}
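  // Per-lane FloorLog2(v) + 1 without a vector lzcnt: look up each of the
  // four nibbles with pshufb (OR-ing with 0xFF00 makes the high bytes produce
  // zero) and take the per-lane maximum contribution.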
FJXL_INLINE SIMDVec16 ValToToken() const {
auto nibble0 =
_mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
_mm256_set1_epi16(0xFF00));
auto nibble1 = _mm256_or_si256(
_mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
_mm256_set1_epi16(0xFF00));
auto nibble2 = _mm256_or_si256(
_mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
_mm256_set1_epi16(0xFF00));
auto nibble3 =
_mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));
auto lut0 = _mm256_broadcastsi128_si256(
_mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
auto lut1 = _mm256_broadcastsi128_si256(
_mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
_mm256_max_epi16(token2, token3));
return SIMDVec16{token};
}
FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
}
FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
}
FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
}
FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
}
FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
}
FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
}
FJXL_INLINE SIMDVec16 Pow2() const {
auto pow2_lo_lut = _mm256_broadcastsi128_si256(
_mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
auto pow2_hi_lut = _mm256_broadcastsi128_si256(
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
1 << 4, 1 << 5, 1 << 6, 1u << 7));
auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));
auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);
auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
return SIMDVec16{pow2};
}
  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
}
  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
}
  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
}
FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
}
FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
}
FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
return SIMDVec16{_mm256_shuffle_epi8(
_mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
}
FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
}
FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
}
template <size_t i>
FJXL_INLINE SIMDVec16 SignedShiftRight() const {
return SIMDVec16{_mm256_srai_epi16(vec, i)};
}
  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
__m128i bytes = _mm_loadu_si128((__m128i*)data);
return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
}
  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
return {Load((const uint16_t*)data)};
}
  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
__m256i bytes = _mm256_loadu_si256((__m256i*)data);
__m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
__m256i alpha = _mm256_srli_epi16(bytes, 8);
return {SIMDVec16{gray}, SIMDVec16{alpha}};
}
  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
__m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
__m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
__m256i g_mask = _mm256_set1_epi32(0xFFFF);
__m256i g = _mm256_permute4x64_epi64(
_mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
_mm256_and_si256(bytes2, g_mask)),
0b11011000);
__m256i a = _mm256_permute4x64_epi64(
_mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
_mm256_srli_epi32(bytes2, 16)),
0b11011000);
return {SIMDVec16{g}, SIMDVec16{a}};
}
  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
__m128i bytes0 = _mm_loadu_si128((__m128i*)data);
__m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
__m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));
__m128i idx =
_mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
__m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
__m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
__m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);
__m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0, 0, 0, 0, 0);
__m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF);
__m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
__m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
__m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
__m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
__m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
__m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
__m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
__m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);
return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
}
  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
    auto load_and_split_lohi = [](const unsigned char* data) {
// LHLHLH...
__m256i bytes = _mm256_loadu_si256((__m256i*)data);
// L0L0L0...
__m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
// H0H0H0...
__m256i hi = _mm256_srli_epi16(bytes, 8);
// LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
__m256i packed = _mm256_packus_epi16(lo, hi);
return _mm256_permute4x64_epi64(packed, 0b11011000);
};
__m256i bytes0 = load_and_split_lohi(data);
__m256i bytes1 = load_and_split_lohi(data + 32);
__m256i bytes2 = load_and_split_lohi(data + 64);
__m256i idx = _mm256_broadcastsi128_si256(
_mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));
__m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
__m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
__m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);
__m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
__m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
__m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
__m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
__m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
__m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
__m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
__m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
__m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
__m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);
// Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
// lower half, and the high bytes in their upper half.
auto combine_low_hi = [](__m256i v) {
__m128i low = _mm256_extracti128_si256(v, 0);
__m128i hi = _mm256_extracti128_si256(v, 1);
__m256i low16 = _mm256_cvtepu8_epi16(low);
__m256i hi16 = _mm256_cvtepu8_epi16(hi);
return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
};
return {SIMDVec16{combine_low_hi(r0r1r2)},
SIMDVec16{combine_low_hi(g0g1g2)},
SIMDVec16{combine_low_hi(b0b1b2)}};
}
  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
__m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
__m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
__m256i rg_mask = _mm256_set1_epi32(0xFFFF);
__m256i rg = _mm256_permute4x64_epi64(
_mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
_mm256_and_si256(bytes2, rg_mask)),
0b11011000);
__m256i b_a = _mm256_permute4x64_epi64(
_mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
_mm256_srli_epi32(bytes2, 16)),
0b11011000);
__m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
__m256i g = _mm256_srli_epi16(rg, 8);
__m256i b = _mm256_and_si256(b_a, _mm256_set1_epi16(0xFF));
__m256i a = _mm256_srli_epi16(b_a, 8);
return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
}
  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
__m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
__m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
__m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
__m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));
auto pack32 = [](__m256i a, __m256i b) {
return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
};
auto packlow32 = [&pack32](__m256i a, __m256i b) {
__m256i mask = _mm256_set1_epi32(0xFFFF);
return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
};
auto packhi32 = [&pack32](__m256i a, __m256i b) {
return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
};
__m256i rb0 = packlow32(bytes0, bytes1);
__m256i rb1 = packlow32(bytes2, bytes3);
__m256i ga0 = packhi32(bytes0, bytes1);
__m256i ga1 = packhi32(bytes2, bytes3);
__m256i r = packlow32(rb0, rb1);
__m256i g = packlow32(ga0, ga1);
__m256i b = packhi32(rb0, rb1);
__m256i a = packhi32(ga0, ga1);
return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
}
void SwapEndian() {
auto indices = _mm256_broadcastsi128_si256(
_mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
    vec = _mm256_shuffle_epi8(vec, indices);
  }
};