// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard. The section that follows is compiled only while
// this header's guard macro and HWY_TARGET_TOGGLE are in the same state (both
// defined or both undefined); the guard macro is then flipped. HWY_TARGET_TOGGLE
// is presumably flipped by Highway between per-target passes, which would allow
// exactly one inclusion per pass.
// The trailing backslash is required: the whole condition must form a single
// logical line, otherwise the #if expression ends at `==` and is ill-formed.
#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#endif
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// The entry points are class templates specialized below for each number of
// bits. Each provides Pack and Unpack member functions which load (Pack) or
// store (Unpack) B raw vectors, and store (Pack) or load (Unpack) a number of
// packed vectors equal to kBits. B denotes the bits per lane: 8 for Pack8, 16
// for Pack16, which is also the upper bound for kBits.
// Primary template for 8-bit lanes. It is intentionally empty: Pack/Unpack are
// only provided by the explicit specializations that follow (kBits = 1..8).
template <size_t kBits>
// <= 8
struct Pack8 {};
// Primary template for 16-bit lanes. It is intentionally empty: Pack/Unpack
// are only provided by the explicit specializations that follow.
template <size_t kBits>
// <= 16
struct Pack16 {};
template <>
struct Pack8<1> {
  // Reads eight vectors of Lanes(d8) bytes each from `raw` and writes a single
  // vector to `packed_out`, in which bit i of every byte comes from the i-th
  // raw vector. The inputs are not masked, so presumably every raw byte is
  // expected to be 0 or 1.
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    // Bytes are reinterpreted as u16 so that 16-bit shifts can be used; a
    // single-bit value shifted by at most 7 never leaves its own byte, hence
    // no masking is needed.
    const V16 in0 = BitCast(du16, LoadU(d8, raw + 0 * stride));
    const V16 in1 = BitCast(du16, LoadU(d8, raw + 1 * stride));
    const V16 in2 = BitCast(du16, LoadU(d8, raw + 2 * stride));
    const V16 in3 = BitCast(du16, LoadU(d8, raw + 3 * stride));
    const V16 in4 = BitCast(du16, LoadU(d8, raw + 4 * stride));
    const V16 in5 = BitCast(du16, LoadU(d8, raw + 5 * stride));
    const V16 in6 = BitCast(du16, LoadU(d8, raw + 6 * stride));
    const V16 in7 = BitCast(du16, LoadU(d8, raw + 7 * stride));

    // Build the byte in three groups, then merge them.
    const V16 bits76 = Or(ShiftLeft<7>(in7), ShiftLeft<6>(in6));
    const V16 bits543 =
        Xor3(ShiftLeft<5>(in5), ShiftLeft<4>(in4), ShiftLeft<3>(in3));
    const V16 bits210 = Xor3(ShiftLeft<2>(in2), ShiftLeft<1>(in1), in0);
    const V16 out = Xor3(bits76, bits543, bits210);
    StoreU(BitCast(d8, out), d8, packed_out);
  }

  // Inverse of Pack: loads one vector from `packed_in` and writes eight
  // vectors to `raw` (Lanes(d8) bytes apart); the i-th output holds bit i of
  // each packed byte, moved down to bit 0.
  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    // Keeps only bit 0 of each byte; this also discards whatever the 16-bit
    // right shifts drag in from the neighbouring byte.
    const V16 lsb = Set(du16, 0x0101u);
    const V16 in = BitCast(du16, LoadU(d8, packed_in));

    StoreU(BitCast(d8, And(in, lsb)), d8, raw + 0 * stride);
    StoreU(BitCast(d8, And(ShiftRight<1>(in), lsb)), d8, raw + 1 * stride);
    StoreU(BitCast(d8, And(ShiftRight<2>(in), lsb)), d8, raw + 2 * stride);
    StoreU(BitCast(d8, And(ShiftRight<3>(in), lsb)), d8, raw + 3 * stride);
    StoreU(BitCast(d8, And(ShiftRight<4>(in), lsb)), d8, raw + 4 * stride);
    StoreU(BitCast(d8, And(ShiftRight<5>(in), lsb)), d8, raw + 5 * stride);
    StoreU(BitCast(d8, And(ShiftRight<6>(in), lsb)), d8, raw + 6 * stride);
    StoreU(BitCast(d8, And(ShiftRight<7>(in), lsb)), d8, raw + 7 * stride);
  }
};
// Pack8<1>
template <>
struct Pack8<2> {
  // Reads eight vectors from `raw` (Lanes(d8) bytes apart) and writes two
  // vectors to `packed_out`. Even-numbered inputs share the first output
  // vector and odd-numbered inputs the second; within a byte the fields sit
  // at bits [0,2), [2,4), [4,6) and [6,8). Inputs are not masked, so
  // presumably every raw byte is expected to be below 4.
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    // Bytes are handled as u16 lanes; 2-bit values shifted by at most 6 stay
    // inside their own byte, so the 16-bit shifts need no masking.
    const V16 in0 = BitCast(du16, LoadU(d8, raw + 0 * stride));
    const V16 in1 = BitCast(du16, LoadU(d8, raw + 1 * stride));
    const V16 in2 = BitCast(du16, LoadU(d8, raw + 2 * stride));
    const V16 in3 = BitCast(du16, LoadU(d8, raw + 3 * stride));
    const V16 in4 = BitCast(du16, LoadU(d8, raw + 4 * stride));
    const V16 in5 = BitCast(du16, LoadU(d8, raw + 5 * stride));
    const V16 in6 = BitCast(du16, LoadU(d8, raw + 6 * stride));
    const V16 in7 = BitCast(du16, LoadU(d8, raw + 7 * stride));

    const V16 even_lo = Or(ShiftLeft<2>(in2), in0);
    const V16 odd_lo = Or(ShiftLeft<2>(in3), in1);
    const V16 even = Xor3(ShiftLeft<6>(in6), ShiftLeft<4>(in4), even_lo);
    const V16 odd = Xor3(ShiftLeft<6>(in7), ShiftLeft<4>(in5), odd_lo);

    StoreU(BitCast(d8, even), d8, packed_out + 0 * stride);
    StoreU(BitCast(d8, odd), d8, packed_out + 1 * stride);
  }

  // Inverse of Pack: loads two vectors from `packed_in` and writes eight
  // vectors of 2-bit values to `raw`.
  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    // Keeps bits [0,2) of each byte, which also removes anything the 16-bit
    // right shifts pull in from the adjacent byte.
    const V16 low2 = Set(du16, 0x0303u);

    const V16 even = BitCast(du16, LoadU(d8, packed_in + 0 * stride));
    const V16 odd = BitCast(du16, LoadU(d8, packed_in + 1 * stride));

    StoreU(BitCast(d8, And(even, low2)), d8, raw + 0 * stride);
    StoreU(BitCast(d8, And(odd, low2)), d8, raw + 1 * stride);
    StoreU(BitCast(d8, And(ShiftRight<2>(even), low2)), d8, raw + 2 * stride);
    StoreU(BitCast(d8, And(ShiftRight<2>(odd), low2)), d8, raw + 3 * stride);
    StoreU(BitCast(d8, And(ShiftRight<4>(even), low2)), d8, raw + 4 * stride);
    StoreU(BitCast(d8, And(ShiftRight<4>(odd), low2)), d8, raw + 5 * stride);
    StoreU(BitCast(d8, And(ShiftRight<6>(even), low2)), d8, raw + 6 * stride);
    StoreU(BitCast(d8, And(ShiftRight<6>(odd), low2)), d8, raw + 7 * stride);
  }
};
// Pack8<2>
template <>
struct Pack8<3> {
  // Reads eight vectors from `raw` (Lanes(d8) bytes apart) and writes three
  // vectors to `packed_out`. Output k (k = 0..2) holds input k in bits [0,3)
  // and input k+4 in bits [3,6). Inputs 3 and 7 are first joined into a 6-bit
  // value whose bit pairs are then spread over bits [6,8) of the three
  // outputs. Inputs are not masked, so presumably every raw byte is expected
  // to be below 8.
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    const V16 in0 = BitCast(du16, LoadU(d8, raw + 0 * stride));
    const V16 in1 = BitCast(du16, LoadU(d8, raw + 1 * stride));
    const V16 in2 = BitCast(du16, LoadU(d8, raw + 2 * stride));
    const V16 in3 = BitCast(du16, LoadU(d8, raw + 3 * stride));
    const V16 in4 = BitCast(du16, LoadU(d8, raw + 4 * stride));
    const V16 in5 = BitCast(du16, LoadU(d8, raw + 5 * stride));
    const V16 in6 = BitCast(du16, LoadU(d8, raw + 6 * stride));
    const V16 in7 = BitCast(du16, LoadU(d8, raw + 7 * stride));

    // Six bits per byte; bits [6,8) of the first three are still free.
    const V16 pair0 = Or(ShiftLeft<3>(in4), in0);
    const V16 pair1 = Or(ShiftLeft<3>(in5), in1);
    const V16 pair2 = Or(ShiftLeft<3>(in6), in2);
    const V16 spill = Or(ShiftLeft<3>(in7), in3);

    // Selects bits [6,8) of each byte. The mask is what keeps the 16-bit
    // shifts of `spill` from leaking into the neighbouring byte.
    const V16 top2 = Set(du16, 0xC0C0u);
    const V16 out0 = OrAnd(pair0, ShiftLeft<2>(spill), top2);  // spill[4,6)
    const V16 out1 = OrAnd(pair1, ShiftLeft<4>(spill), top2);  // spill[2,4)
    const V16 out2 = OrAnd(pair2, ShiftLeft<6>(spill), top2);  // spill[0,2)

    StoreU(BitCast(d8, out0), d8, packed_out + 0 * stride);
    StoreU(BitCast(d8, out1), d8, packed_out + 1 * stride);
    StoreU(BitCast(d8, out2), d8, packed_out + 2 * stride);
  }

  // Inverse of Pack: loads three vectors from `packed_in` and writes eight
  // vectors of 3-bit values to `raw`.
  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    const V16 low3 = Set(du16, 0x0707u);  // bits [0,3) of each byte

    const V16 p0 = BitCast(du16, LoadU(d8, packed_in + 0 * stride));
    const V16 p1 = BitCast(du16, LoadU(d8, packed_in + 1 * stride));
    const V16 p2 = BitCast(du16, LoadU(d8, packed_in + 2 * stride));

    // Directly stored fields: inputs 0..2 (low) and 4..6 (shifted by 3).
    StoreU(BitCast(d8, And(p0, low3)), d8, raw + 0 * stride);
    StoreU(BitCast(d8, And(p1, low3)), d8, raw + 1 * stride);
    StoreU(BitCast(d8, And(p2, low3)), d8, raw + 2 * stride);
    StoreU(BitCast(d8, And(ShiftRight<3>(p0), low3)), d8, raw + 4 * stride);
    StoreU(BitCast(d8, And(ShiftRight<3>(p1), low3)), d8, raw + 5 * stride);
    StoreU(BitCast(d8, And(ShiftRight<3>(p2), low3)), d8, raw + 6 * stride);

    // Reassemble the 6-bit spill value from bits [6,8) of p0..p2. Masking
    // happens before the shift so no bits cross byte boundaries.
    const V16 top2 = Set(du16, 0xC0C0u);
    const V16 spill_lo = ShiftRight<6>(And(p2, top2));
    const V16 spill_mid = ShiftRight<4>(And(p1, top2));
    const V16 spill_hi = ShiftRight<2>(And(p0, top2));
    const V16 spill = Xor3(spill_lo, spill_mid, spill_hi);

    // Input 3 occupies the low half of the spill value, input 7 the high half.
    StoreU(BitCast(d8, And(low3, spill)), d8, raw + 3 * stride);
    StoreU(BitCast(d8, And(low3, ShiftRight<3>(spill))), d8, raw + 7 * stride);
  }
};
// Pack8<3>
template <>
struct Pack8<4> {
  // Reads eight vectors from `raw` (Lanes(d8) bytes apart) and writes four
  // vectors to `packed_out`, two 4-bit fields per byte. The (low, high)
  // nibble sources per output are (0,2), (1,3), (4,6) and (5,7). Inputs are
  // not masked, so presumably every raw byte is expected to be below 16.
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    // Bytes are handled as u16 lanes; a 4-bit value shifted left by 4 stays
    // inside its own byte, so no masking is required.
    const V16 in0 = BitCast(du16, LoadU(d8, raw + 0 * stride));
    const V16 in1 = BitCast(du16, LoadU(d8, raw + 1 * stride));
    const V16 in2 = BitCast(du16, LoadU(d8, raw + 2 * stride));
    const V16 in3 = BitCast(du16, LoadU(d8, raw + 3 * stride));
    const V16 in4 = BitCast(du16, LoadU(d8, raw + 4 * stride));
    const V16 in5 = BitCast(du16, LoadU(d8, raw + 5 * stride));
    const V16 in6 = BitCast(du16, LoadU(d8, raw + 6 * stride));
    const V16 in7 = BitCast(du16, LoadU(d8, raw + 7 * stride));

    const V16 out0 = Or(ShiftLeft<4>(in2), in0);
    const V16 out1 = Or(ShiftLeft<4>(in3), in1);
    const V16 out2 = Or(ShiftLeft<4>(in6), in4);
    const V16 out3 = Or(ShiftLeft<4>(in7), in5);

    StoreU(BitCast(d8, out0), d8, packed_out + 0 * stride);
    StoreU(BitCast(d8, out1), d8, packed_out + 1 * stride);
    StoreU(BitCast(d8, out2), d8, packed_out + 2 * stride);
    StoreU(BitCast(d8, out3), d8, packed_out + 3 * stride);
  }

  // Inverse of Pack: loads four vectors from `packed_in` and writes eight
  // vectors of 4-bit values to `raw`.
  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    // Low nibble of each byte; also strips bits that the 16-bit right shift
    // moves in from the adjacent byte.
    const V16 nibble = Set(du16, 0x0F0Fu);

    const V16 p0 = BitCast(du16, LoadU(d8, packed_in + 0 * stride));
    const V16 p1 = BitCast(du16, LoadU(d8, packed_in + 1 * stride));
    const V16 p2 = BitCast(du16, LoadU(d8, packed_in + 2 * stride));
    const V16 p3 = BitCast(du16, LoadU(d8, packed_in + 3 * stride));

    StoreU(BitCast(d8, And(p0, nibble)), d8, raw + 0 * stride);
    StoreU(BitCast(d8, And(p1, nibble)), d8, raw + 1 * stride);
    StoreU(BitCast(d8, And(ShiftRight<4>(p0), nibble)), d8, raw + 2 * stride);
    StoreU(BitCast(d8, And(ShiftRight<4>(p1), nibble)), d8, raw + 3 * stride);
    StoreU(BitCast(d8, And(p2, nibble)), d8, raw + 4 * stride);
    StoreU(BitCast(d8, And(p3, nibble)), d8, raw + 5 * stride);
    StoreU(BitCast(d8, And(ShiftRight<4>(p2), nibble)), d8, raw + 6 * stride);
    StoreU(BitCast(d8, And(ShiftRight<4>(p3), nibble)), d8, raw + 7 * stride);
  }
};
// Pack8<4>
template <>
struct Pack8<5> {
  // Reads eight vectors from `raw` (Lanes(d8) bytes apart) and writes five
  // vectors to `packed_out`. Output k (k = 0..3) holds input k in bits [0,5)
  // and bits [2,5) of input k+4 in bits [5,8). The fifth output collects the
  // low two bits of inputs 4..7. Inputs 0..3 are not masked, so presumably
  // every raw byte is expected to be below 32.
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    const V16 in0 = BitCast(du16, LoadU(d8, raw + 0 * stride));
    const V16 in1 = BitCast(du16, LoadU(d8, raw + 1 * stride));
    const V16 in2 = BitCast(du16, LoadU(d8, raw + 2 * stride));
    const V16 in3 = BitCast(du16, LoadU(d8, raw + 3 * stride));
    const V16 in4 = BitCast(du16, LoadU(d8, raw + 4 * stride));
    const V16 in5 = BitCast(du16, LoadU(d8, raw + 5 * stride));
    const V16 in6 = BitCast(du16, LoadU(d8, raw + 6 * stride));
    const V16 in7 = BitCast(du16, LoadU(d8, raw + 7 * stride));

    // Bits [5,8) of each byte: after the shift by 3 they carry bits [2,5) of
    // in4..in7; everything else from the shifted value is masked away.
    const V16 top3 = Set(du16, 0xE0E0u);
    const V16 out0 = OrAnd(in0, ShiftLeft<3>(in4), top3);
    const V16 out1 = OrAnd(in1, ShiftLeft<3>(in5), top3);
    const V16 out2 = OrAnd(in2, ShiftLeft<3>(in6), top3);
    const V16 out3 = OrAnd(in3, ShiftLeft<3>(in7), top3);
    StoreU(BitCast(d8, out0), d8, packed_out + 0 * stride);
    StoreU(BitCast(d8, out1), d8, packed_out + 1 * stride);
    StoreU(BitCast(d8, out2), d8, packed_out + 2 * stride);
    StoreU(BitCast(d8, out3), d8, packed_out + 3 * stride);

    // The remaining bits [0,2) of in4..in7 are concatenated into one byte,
    // in4 lowest. Masking precedes each shift so bits stay within the byte.
    const V16 low2 = Set(du16, 0x0303u);
    const V16 field4 = And(in4, low2);
    const V16 field5 = ShiftLeft<2>(And(in5, low2));
    const V16 field6 = ShiftLeft<4>(And(in6, low2));
    const V16 field7 = ShiftLeft<6>(And(in7, low2));
    const V16 out4 = Or(field4, Xor3(field5, field6, field7));
    StoreU(BitCast(d8, out4), d8, packed_out + 4 * stride);
  }

  // Inverse of Pack: loads five vectors from `packed_in` and writes eight
  // vectors of 5-bit values to `raw`.
  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    const V16 p0 = BitCast(du16, LoadU(d8, packed_in + 0 * stride));
    const V16 p1 = BitCast(du16, LoadU(d8, packed_in + 1 * stride));
    const V16 p2 = BitCast(du16, LoadU(d8, packed_in + 2 * stride));
    const V16 p3 = BitCast(du16, LoadU(d8, packed_in + 3 * stride));
    const V16 p4 = BitCast(du16, LoadU(d8, packed_in + 4 * stride));

    const V16 low5 = Set(du16, 0x1F1Fu);  // bits [0,5) of each byte

    // Inputs 0..3 are simply the low five bits of the first four vectors.
    StoreU(BitCast(d8, And(p0, low5)), d8, raw + 0 * stride);
    StoreU(BitCast(d8, And(p1, low5)), d8, raw + 1 * stride);
    StoreU(BitCast(d8, And(p2, low5)), d8, raw + 2 * stride);
    StoreU(BitCast(d8, And(p3, low5)), d8, raw + 3 * stride);

    // Bits [5,8) of p0..p3, moved down to bits [2,5), are the upper part of
    // inputs 4..7. They are isolated before shifting so nothing crosses bytes.
    const V16 upper4 = ShiftRight<3>(AndNot(low5, p0));
    const V16 upper5 = ShiftRight<3>(AndNot(low5, p1));
    const V16 upper6 = ShiftRight<3>(AndNot(low5, p2));
    const V16 upper7 = ShiftRight<3>(AndNot(low5, p3));

    // The lower two bits of inputs 4..7 come from the four fields of p4.
    const V16 low2 = Set(du16, 0x0303u);
    const V16 out4 = OrAnd(upper4, low2, p4);
    const V16 out5 = OrAnd(upper5, low2, ShiftRight<2>(p4));
    const V16 out6 = OrAnd(upper6, low2, ShiftRight<4>(p4));
    const V16 out7 = OrAnd(upper7, low2, ShiftRight<6>(p4));

    StoreU(BitCast(d8, out4), d8, raw + 4 * stride);
    StoreU(BitCast(d8, out5), d8, raw + 5 * stride);
    StoreU(BitCast(d8, out6), d8, raw + 6 * stride);
    StoreU(BitCast(d8, out7), d8, raw + 7 * stride);
  }
};
// Pack8<5>
template <>
struct Pack8<6> {
  // Reads eight vectors from `raw` (Lanes(d8) bytes apart) and writes six
  // vectors to `packed_out`. Inputs 0,1,2 and 4,5,6 occupy bits [0,6) of
  // outputs 0..2 and 3..5 respectively. Input 3 is split into three bit pairs
  // stored in bits [6,8) of outputs 0..2; input 7 likewise in outputs 3..5.
  // Inputs 0..2 and 4..6 are not masked, so presumably every raw byte is
  // expected to be below 64.
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    const V16 in0 = BitCast(du16, LoadU(d8, raw + 0 * stride));
    const V16 in1 = BitCast(du16, LoadU(d8, raw + 1 * stride));
    const V16 in2 = BitCast(du16, LoadU(d8, raw + 2 * stride));
    const V16 in3 = BitCast(du16, LoadU(d8, raw + 3 * stride));
    const V16 in4 = BitCast(du16, LoadU(d8, raw + 4 * stride));
    const V16 in5 = BitCast(du16, LoadU(d8, raw + 5 * stride));
    const V16 in6 = BitCast(du16, LoadU(d8, raw + 6 * stride));
    const V16 in7 = BitCast(du16, LoadU(d8, raw + 7 * stride));

    // Bits [6,8) of each byte; masking with it confines the 16-bit shifts of
    // in3/in7 to the byte they belong to.
    const V16 top2 = Set(du16, 0xC0C0u);

    // First triplet carries in3: bits [4,6), [2,4), [0,2) respectively.
    const V16 out0 = OrAnd(in0, ShiftLeft<2>(in3), top2);
    const V16 out1 = OrAnd(in1, ShiftLeft<4>(in3), top2);
    const V16 out2 = OrAnd(in2, ShiftLeft<6>(in3), top2);
    // Second triplet carries in7 in the same manner.
    const V16 out3 = OrAnd(in4, ShiftLeft<2>(in7), top2);
    const V16 out4 = OrAnd(in5, ShiftLeft<4>(in7), top2);
    const V16 out5 = OrAnd(in6, ShiftLeft<6>(in7), top2);

    StoreU(BitCast(d8, out0), d8, packed_out + 0 * stride);
    StoreU(BitCast(d8, out1), d8, packed_out + 1 * stride);
    StoreU(BitCast(d8, out2), d8, packed_out + 2 * stride);
    StoreU(BitCast(d8, out3), d8, packed_out + 3 * stride);
    StoreU(BitCast(d8, out4), d8, packed_out + 4 * stride);
    StoreU(BitCast(d8, out5), d8, packed_out + 5 * stride);
  }

  // Inverse of Pack: loads six vectors from `packed_in` and writes eight
  // vectors of 6-bit values to `raw`.
  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    const V16 low6 = Set(du16, 0x3F3Fu);  // bits [0,6) of each byte

    const V16 p0 = BitCast(du16, LoadU(d8, packed_in + 0 * stride));
    const V16 p1 = BitCast(du16, LoadU(d8, packed_in + 1 * stride));
    const V16 p2 = BitCast(du16, LoadU(d8, packed_in + 2 * stride));
    const V16 p3 = BitCast(du16, LoadU(d8, packed_in + 3 * stride));
    const V16 p4 = BitCast(du16, LoadU(d8, packed_in + 4 * stride));
    const V16 p5 = BitCast(du16, LoadU(d8, packed_in + 5 * stride));

    // The six directly stored inputs are the low six bits of p0..p5.
    StoreU(BitCast(d8, And(p0, low6)), d8, raw + 0 * stride);
    StoreU(BitCast(d8, And(p1, low6)), d8, raw + 1 * stride);
    StoreU(BitCast(d8, And(p2, low6)), d8, raw + 2 * stride);
    StoreU(BitCast(d8, And(p3, low6)), d8, raw + 4 * stride);
    StoreU(BitCast(d8, And(p4, low6)), d8, raw + 5 * stride);
    StoreU(BitCast(d8, And(p5, low6)), d8, raw + 6 * stride);

    // Input 3 is rebuilt from bits [6,8) of p0..p2 and input 7 from bits
    // [6,8) of p3..p5. AndNot isolates those bits before the shift so that
    // nothing crosses a byte boundary.
    const V16 in3_lo = ShiftRight<6>(AndNot(low6, p2));
    const V16 in3_mid = ShiftRight<4>(AndNot(low6, p1));
    const V16 in3_hi = ShiftRight<2>(AndNot(low6, p0));
    const V16 out3 = Xor3(in3_lo, in3_mid, in3_hi);

    const V16 in7_lo = ShiftRight<6>(AndNot(low6, p5));
    const V16 in7_mid = ShiftRight<4>(AndNot(low6, p4));
    const V16 in7_hi = ShiftRight<2>(AndNot(low6, p3));
    const V16 out7 = Xor3(in7_lo, in7_mid, in7_hi);

    StoreU(BitCast(d8, out3), d8, raw + 3 * stride);
    StoreU(BitCast(d8, out7), d8, raw + 7 * stride);
  }
};
// Pack8<6>
template <>
struct Pack8<7> {
  // Reads eight vectors from `raw` (Lanes(d8) bytes apart) and writes seven
  // vectors to `packed_out`. Output k (k = 0..6) holds input k in bits [0,7)
  // and bit (6 - k) of input 7 in its top bit. Inputs 0..6 are not masked, so
  // presumably every raw byte is expected to be below 128.
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    const V16 in0 = BitCast(du16, LoadU(d8, raw + 0 * stride));
    const V16 in1 = BitCast(du16, LoadU(d8, raw + 1 * stride));
    const V16 in2 = BitCast(du16, LoadU(d8, raw + 2 * stride));
    const V16 in3 = BitCast(du16, LoadU(d8, raw + 3 * stride));
    const V16 in4 = BitCast(du16, LoadU(d8, raw + 4 * stride));
    const V16 in5 = BitCast(du16, LoadU(d8, raw + 5 * stride));
    const V16 in6 = BitCast(du16, LoadU(d8, raw + 6 * stride));
    // This one is distributed bit by bit over the top bit of all outputs.
    const V16 spread = BitCast(du16, LoadU(d8, raw + 7 * stride));

    // Top bit of each byte; masking with it keeps only the one bit of
    // `spread` that each shift moves into position 7.
    const V16 msb = Set(du16, 0x8080u);

    // Adding the vector to itself moves every bit up by one position.
    const V16 out0 = OrAnd(in0, Add(spread, spread), msb);
    const V16 out1 = OrAnd(in1, ShiftLeft<2>(spread), msb);
    const V16 out2 = OrAnd(in2, ShiftLeft<3>(spread), msb);
    const V16 out3 = OrAnd(in3, ShiftLeft<4>(spread), msb);
    const V16 out4 = OrAnd(in4, ShiftLeft<5>(spread), msb);
    const V16 out5 = OrAnd(in5, ShiftLeft<6>(spread), msb);
    const V16 out6 = OrAnd(in6, ShiftLeft<7>(spread), msb);

    StoreU(BitCast(d8, out0), d8, packed_out + 0 * stride);
    StoreU(BitCast(d8, out1), d8, packed_out + 1 * stride);
    StoreU(BitCast(d8, out2), d8, packed_out + 2 * stride);
    StoreU(BitCast(d8, out3), d8, packed_out + 3 * stride);
    StoreU(BitCast(d8, out4), d8, packed_out + 4 * stride);
    StoreU(BitCast(d8, out5), d8, packed_out + 5 * stride);
    StoreU(BitCast(d8, out6), d8, packed_out + 6 * stride);
  }

  // Inverse of Pack: loads seven vectors from `packed_in` and writes eight
  // vectors of 7-bit values to `raw`.
  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<D8> du16;
    using V16 = Vec<decltype(du16)>;
    const size_t stride = Lanes(d8);

    const V16 p0 = BitCast(du16, LoadU(d8, packed_in + 0 * stride));
    const V16 p1 = BitCast(du16, LoadU(d8, packed_in + 1 * stride));
    const V16 p2 = BitCast(du16, LoadU(d8, packed_in + 2 * stride));
    const V16 p3 = BitCast(du16, LoadU(d8, packed_in + 3 * stride));
    const V16 p4 = BitCast(du16, LoadU(d8, packed_in + 4 * stride));
    const V16 p5 = BitCast(du16, LoadU(d8, packed_in + 5 * stride));
    const V16 p6 = BitCast(du16, LoadU(d8, packed_in + 6 * stride));

    const V16 low7 = Set(du16, 0x7F7Fu);  // bits [0,7) of each byte

    // Inputs 0..6 are the low seven bits of the corresponding packed vector.
    StoreU(BitCast(d8, And(p0, low7)), d8, raw + 0 * stride);
    StoreU(BitCast(d8, And(p1, low7)), d8, raw + 1 * stride);
    StoreU(BitCast(d8, And(p2, low7)), d8, raw + 2 * stride);
    StoreU(BitCast(d8, And(p3, low7)), d8, raw + 3 * stride);
    StoreU(BitCast(d8, And(p4, low7)), d8, raw + 4 * stride);
    StoreU(BitCast(d8, And(p5, low7)), d8, raw + 5 * stride);
    StoreU(BitCast(d8, And(p6, low7)), d8, raw + 6 * stride);

    // Input 7 is gathered from the top bit of every packed vector. Each top
    // bit is isolated first (AndNot) and then moved down to its position.
    const V16 bits012 =
        Xor3(ShiftRight<7>(AndNot(low7, p6)), ShiftRight<6>(AndNot(low7, p5)),
             ShiftRight<5>(AndNot(low7, p4)));
    const V16 bits345 =
        Xor3(ShiftRight<4>(AndNot(low7, p3)), ShiftRight<3>(AndNot(low7, p2)),
             ShiftRight<2>(AndNot(low7, p1)));
    const V16 bit6 = ShiftRight<1>(AndNot(low7, p0));
    const V16 out7 = Xor3(bit6, bits012, bits345);
    StoreU(BitCast(d8, out7), d8, raw + 7 * stride);
  }
};
// Pack8<7>
template <>
struct Pack8<8> {
  // With all eight bits in use there is nothing to compress: the eight
  // vectors read from `raw` are written unchanged to `packed_out`.
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    using V8 = Vec<D8>;
    const size_t stride = Lanes(d8);

    // All loads are issued before the first store, as in a plain block copy.
    const V8 v0 = LoadU(d8, raw + 0 * stride);
    const V8 v1 = LoadU(d8, raw + 1 * stride);
    const V8 v2 = LoadU(d8, raw + 2 * stride);
    const V8 v3 = LoadU(d8, raw + 3 * stride);
    const V8 v4 = LoadU(d8, raw + 4 * stride);
    const V8 v5 = LoadU(d8, raw + 5 * stride);
    const V8 v6 = LoadU(d8, raw + 6 * stride);
    const V8 v7 = LoadU(d8, raw + 7 * stride);

    StoreU(v0, d8, packed_out + 0 * stride);
    StoreU(v1, d8, packed_out + 1 * stride);
    StoreU(v2, d8, packed_out + 2 * stride);
    StoreU(v3, d8, packed_out + 3 * stride);
    StoreU(v4, d8, packed_out + 4 * stride);
    StoreU(v5, d8, packed_out + 5 * stride);
    StoreU(v6, d8, packed_out + 6 * stride);
    StoreU(v7, d8, packed_out + 7 * stride);
  }

  // Inverse of Pack, which is again a plain copy: eight vectors are read
  // from `packed_in` and written unchanged to `raw`.
  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    using V8 = Vec<D8>;
    const size_t stride = Lanes(d8);

    // All loads are issued before the first store.
    const V8 v0 = LoadU(d8, packed_in + 0 * stride);
    const V8 v1 = LoadU(d8, packed_in + 1 * stride);
    const V8 v2 = LoadU(d8, packed_in + 2 * stride);
    const V8 v3 = LoadU(d8, packed_in + 3 * stride);
    const V8 v4 = LoadU(d8, packed_in + 4 * stride);
    const V8 v5 = LoadU(d8, packed_in + 5 * stride);
    const V8 v6 = LoadU(d8, packed_in + 6 * stride);
    const V8 v7 = LoadU(d8, packed_in + 7 * stride);

    StoreU(v0, d8, raw + 0 * stride);
    StoreU(v1, d8, raw + 1 * stride);
    StoreU(v2, d8, raw + 2 * stride);
    StoreU(v3, d8, raw + 3 * stride);
    StoreU(v4, d8, raw + 4 * stride);
    StoreU(v5, d8, raw + 5 * stride);
    StoreU(v6, d8, raw + 6 * stride);
    StoreU(v7, d8, raw + 7 * stride);
  }
};
// Pack8<8>
template <>
struct Pack16<1> {
  // Reads sixteen vectors of Lanes(d) u16 lanes each from `raw` and writes a
  // single vector to `packed_out`, in which bit i of every lane comes from
  // the i-th raw vector. Inputs are not masked, so presumably every raw lane
  // is expected to be 0 or 1.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * stride);
    const V in1 = LoadU(d, raw + 1 * stride);
    const V in2 = LoadU(d, raw + 2 * stride);
    const V in3 = LoadU(d, raw + 3 * stride);
    const V in4 = LoadU(d, raw + 4 * stride);
    const V in5 = LoadU(d, raw + 5 * stride);
    const V in6 = LoadU(d, raw + 6 * stride);
    const V in7 = LoadU(d, raw + 7 * stride);
    const V in8 = LoadU(d, raw + 8 * stride);
    const V in9 = LoadU(d, raw + 9 * stride);
    const V inA = LoadU(d, raw + 10 * stride);
    const V inB = LoadU(d, raw + 11 * stride);
    const V inC = LoadU(d, raw + 12 * stride);
    const V inD = LoadU(d, raw + 13 * stride);
    const V inE = LoadU(d, raw + 14 * stride);
    const V inF = LoadU(d, raw + 15 * stride);

    // Five groups of three bits each; adding in1 to itself shifts it by one.
    const V bits_0_2 = Xor3(ShiftLeft<2>(in2), Add(in1, in1), in0);
    const V bits_3_5 =
        Xor3(ShiftLeft<5>(in5), ShiftLeft<4>(in4), ShiftLeft<3>(in3));
    const V bits_6_8 =
        Xor3(ShiftLeft<8>(in8), ShiftLeft<7>(in7), ShiftLeft<6>(in6));
    const V bits_9_11 =
        Xor3(ShiftLeft<11>(inB), ShiftLeft<10>(inA), ShiftLeft<9>(in9));
    const V bits_12_14 =
        Xor3(ShiftLeft<14>(inE), ShiftLeft<13>(inD), ShiftLeft<12>(inC));

    // Bit 15 joins the two lowest groups; the halves are then merged.
    const V lower = Xor3(ShiftLeft<15>(inF), bits_0_2, bits_3_5);
    const V upper = Xor3(bits_6_8, bits_9_11, bits_12_14);
    StoreU(Or(lower, upper), d, packed_out);
  }

  // Inverse of Pack: loads one vector from `packed_in` and writes sixteen
  // vectors to `raw`; the i-th output holds bit i of each packed lane, moved
  // down to bit 0.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V lsb = Set(d, 1u);  // bit 0 of each lane
    const V in = LoadU(d, packed_in);

    StoreU(And(in, lsb), d, raw + 0 * stride);
    StoreU(And(ShiftRight<1>(in), lsb), d, raw + 1 * stride);
    StoreU(And(ShiftRight<2>(in), lsb), d, raw + 2 * stride);
    StoreU(And(ShiftRight<3>(in), lsb), d, raw + 3 * stride);
    StoreU(And(ShiftRight<4>(in), lsb), d, raw + 4 * stride);
    StoreU(And(ShiftRight<5>(in), lsb), d, raw + 5 * stride);
    StoreU(And(ShiftRight<6>(in), lsb), d, raw + 6 * stride);
    StoreU(And(ShiftRight<7>(in), lsb), d, raw + 7 * stride);
    StoreU(And(ShiftRight<8>(in), lsb), d, raw + 8 * stride);
    StoreU(And(ShiftRight<9>(in), lsb), d, raw + 9 * stride);
    StoreU(And(ShiftRight<10>(in), lsb), d, raw + 10 * stride);
    StoreU(And(ShiftRight<11>(in), lsb), d, raw + 11 * stride);
    StoreU(And(ShiftRight<12>(in), lsb), d, raw + 12 * stride);
    StoreU(And(ShiftRight<13>(in), lsb), d, raw + 13 * stride);
    StoreU(And(ShiftRight<14>(in), lsb), d, raw + 14 * stride);
    // Nothing remains above bit 15 after the shift, so no mask is applied.
    StoreU(ShiftRight<15>(in), d, raw + 15 * stride);
  }
};
// Pack16<1>
template <>
struct Pack16<2> {
  // Reads sixteen vectors from `raw` (Lanes(d) u16 lanes apart) and writes
  // two vectors to `packed_out`. Even-numbered inputs share the first output
  // and odd-numbered inputs the second; input 2j or 2j+1 occupies bits
  // [2j, 2j+2) of its lane. Inputs are not masked, so presumably every raw
  // lane is expected to be below 4.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * stride);
    const V in1 = LoadU(d, raw + 1 * stride);
    const V in2 = LoadU(d, raw + 2 * stride);
    const V in3 = LoadU(d, raw + 3 * stride);
    const V in4 = LoadU(d, raw + 4 * stride);
    const V in5 = LoadU(d, raw + 5 * stride);
    const V in6 = LoadU(d, raw + 6 * stride);
    const V in7 = LoadU(d, raw + 7 * stride);
    const V in8 = LoadU(d, raw + 8 * stride);
    const V in9 = LoadU(d, raw + 9 * stride);
    const V inA = LoadU(d, raw + 10 * stride);
    const V inB = LoadU(d, raw + 11 * stride);
    const V inC = LoadU(d, raw + 12 * stride);
    const V inD = LoadU(d, raw + 13 * stride);
    const V inE = LoadU(d, raw + 14 * stride);
    const V inF = LoadU(d, raw + 15 * stride);

    // Each stage folds two more fields into the running even/odd result.
    const V even_a = Xor3(ShiftLeft<4>(in4), ShiftLeft<2>(in2), in0);
    const V odd_a = Xor3(ShiftLeft<4>(in5), ShiftLeft<2>(in3), in1);
    const V even_b = Xor3(even_a, ShiftLeft<8>(in8), ShiftLeft<6>(in6));
    const V odd_b = Xor3(odd_a, ShiftLeft<8>(in9), ShiftLeft<6>(in7));
    const V even_c = Xor3(even_b, ShiftLeft<12>(inC), ShiftLeft<10>(inA));
    const V odd_c = Xor3(odd_b, ShiftLeft<12>(inD), ShiftLeft<10>(inB));
    // The last field of each lane goes into bits [14,16).
    const V even = Or(even_c, ShiftLeft<14>(inE));
    const V odd = Or(odd_c, ShiftLeft<14>(inF));

    StoreU(even, d, packed_out + 0 * stride);
    StoreU(odd, d, packed_out + 1 * stride);
  }

  // Inverse of Pack: loads two vectors from `packed_in` and writes sixteen
  // vectors of 2-bit values to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V low2 = Set(d, 0x3u);  // bits [0,2) of each lane

    const V even = LoadU(d, packed_in + 0 * stride);
    const V odd = LoadU(d, packed_in + 1 * stride);

    StoreU(And(even, low2), d, raw + 0 * stride);
    StoreU(And(odd, low2), d, raw + 1 * stride);
    StoreU(And(ShiftRight<2>(even), low2), d, raw + 2 * stride);
    StoreU(And(ShiftRight<2>(odd), low2), d, raw + 3 * stride);
    StoreU(And(ShiftRight<4>(even), low2), d, raw + 4 * stride);
    StoreU(And(ShiftRight<4>(odd), low2), d, raw + 5 * stride);
    StoreU(And(ShiftRight<6>(even), low2), d, raw + 6 * stride);
    StoreU(And(ShiftRight<6>(odd), low2), d, raw + 7 * stride);
    StoreU(And(ShiftRight<8>(even), low2), d, raw + 8 * stride);
    StoreU(And(ShiftRight<8>(odd), low2), d, raw + 9 * stride);
    StoreU(And(ShiftRight<10>(even), low2), d, raw + 10 * stride);
    StoreU(And(ShiftRight<10>(odd), low2), d, raw + 11 * stride);
    StoreU(And(ShiftRight<12>(even), low2), d, raw + 12 * stride);
    StoreU(And(ShiftRight<12>(odd), low2), d, raw + 13 * stride);
    // Only two bits remain after shifting by 14, so no mask is applied.
    StoreU(ShiftRight<14>(even), d, raw + 14 * stride);
    StoreU(ShiftRight<14>(odd), d, raw + 15 * stride);
  }
};
// Pack16<2>
template <>
struct Pack16<3> {
  // Reads sixteen vectors from `raw` (Lanes(d) u16 lanes apart) and writes
  // three vectors to `packed_out`. Output k (k = 0..2) holds inputs k, k+3,
  // k+6, k+9 and k+12 in consecutive 3-bit fields starting at bit 0, which
  // uses 15 bits. Bit k of input 15 is stored in the remaining top bit of
  // output k. Inputs 0..14 are not masked, so presumably every raw lane is
  // expected to be below 8.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * stride);
    const V in1 = LoadU(d, raw + 1 * stride);
    const V in2 = LoadU(d, raw + 2 * stride);
    const V in3 = LoadU(d, raw + 3 * stride);
    const V in4 = LoadU(d, raw + 4 * stride);
    const V in5 = LoadU(d, raw + 5 * stride);
    const V in6 = LoadU(d, raw + 6 * stride);
    const V in7 = LoadU(d, raw + 7 * stride);
    const V in8 = LoadU(d, raw + 8 * stride);
    const V in9 = LoadU(d, raw + 9 * stride);
    const V inA = LoadU(d, raw + 10 * stride);
    const V inB = LoadU(d, raw + 11 * stride);
    const V inC = LoadU(d, raw + 12 * stride);
    const V inD = LoadU(d, raw + 13 * stride);
    const V inE = LoadU(d, raw + 14 * stride);
    const V inF = LoadU(d, raw + 15 * stride);

    // Fields at bits [0,3), [3,6) and [6,9) of each output.
    const V low0 = Xor3(ShiftLeft<6>(in6), ShiftLeft<3>(in3), in0);
    const V low1 = Xor3(ShiftLeft<6>(in7), ShiftLeft<3>(in4), in1);
    const V low2 = Xor3(ShiftLeft<6>(in8), ShiftLeft<3>(in5), in2);

    // Add the fields at bits [9,12) and [12,15); bit 15 is still unused.
    const V body0 = Xor3(low0, ShiftLeft<12>(inC), ShiftLeft<9>(in9));
    const V body1 = Xor3(low1, ShiftLeft<12>(inD), ShiftLeft<9>(inA));
    const V body2 = Xor3(low2, ShiftLeft<12>(inE), ShiftLeft<9>(inB));

    const V msb = Set(d, 0x8000u);  // bit 15 of each lane
    // Shifting by 15 leaves nothing but bit 15, so this one needs no mask.
    const V out0 = Or(body0, ShiftLeft<15>(inF));
    const V out1 = OrAnd(body1, ShiftLeft<14>(inF), msb);
    const V out2 = OrAnd(body2, ShiftLeft<13>(inF), msb);

    StoreU(out0, d, packed_out + 0 * stride);
    StoreU(out1, d, packed_out + 1 * stride);
    StoreU(out2, d, packed_out + 2 * stride);
  }

  // Inverse of Pack: loads three vectors from `packed_in` and writes sixteen
  // vectors of 3-bit values to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V low3 = Set(d, 0x7u);  // bits [0,3) of each lane

    const V p0 = LoadU(d, packed_in + 0 * stride);
    const V p1 = LoadU(d, packed_in + 1 * stride);
    const V p2 = LoadU(d, packed_in + 2 * stride);

    StoreU(And(low3, p0), d, raw + 0 * stride);
    StoreU(And(low3, p1), d, raw + 1 * stride);
    StoreU(And(low3, p2), d, raw + 2 * stride);

    StoreU(And(low3, ShiftRight<3>(p0)), d, raw + 3 * stride);
    StoreU(And(low3, ShiftRight<3>(p1)), d, raw + 4 * stride);
    StoreU(And(low3, ShiftRight<3>(p2)), d, raw + 5 * stride);

    StoreU(And(low3, ShiftRight<6>(p0)), d, raw + 6 * stride);
    StoreU(And(low3, ShiftRight<6>(p1)), d, raw + 7 * stride);
    StoreU(And(low3, ShiftRight<6>(p2)), d, raw + 8 * stride);

    StoreU(And(low3, ShiftRight<9>(p0)), d, raw + 9 * stride);
    StoreU(And(low3, ShiftRight<9>(p1)), d, raw + 10 * stride);
    StoreU(And(low3, ShiftRight<9>(p2)), d, raw + 11 * stride);

    StoreU(And(low3, ShiftRight<12>(p0)), d, raw + 12 * stride);
    StoreU(And(low3, ShiftRight<12>(p1)), d, raw + 13 * stride);
    StoreU(And(low3, ShiftRight<12>(p2)), d, raw + 14 * stride);

    // Input 15 is reassembled from the top bit of p0 (bit 0), p1 (bit 1) and
    // p2 (bit 2). Adding top1 to itself shifts it up by one position.
    const V top0 = ShiftRight<15>(p0);
    const V top1 = ShiftRight<15>(p1);
    const V top2 = ShiftRight<15>(p2);
    const V outF = Xor3(ShiftLeft<2>(top2), Add(top1, top1), top0);
    StoreU(outF, d, raw + 15 * stride);
  }
};
// End of Pack16<3>.
// Pack16<4>: sixteen vectors of 4-bit values <-> four packed u16 vectors.
// Each packed lane holds four nibbles. Pack does not mask its inputs, so lanes
// of `raw` are expected to use only their low 4 bits.
template <>
struct Pack16<4> {
  // Loads 16 vectors from `raw` and stores 4 vectors to `packed_out`.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * N);
    const V in1 = LoadU(d, raw + 1 * N);
    const V in2 = LoadU(d, raw + 2 * N);
    const V in3 = LoadU(d, raw + 3 * N);
    const V in4 = LoadU(d, raw + 4 * N);
    const V in5 = LoadU(d, raw + 5 * N);
    const V in6 = LoadU(d, raw + 6 * N);
    const V in7 = LoadU(d, raw + 7 * N);
    const V in8 = LoadU(d, raw + 8 * N);
    const V in9 = LoadU(d, raw + 9 * N);
    const V inA = LoadU(d, raw + 0xA * N);
    const V inB = LoadU(d, raw + 0xB * N);
    const V inC = LoadU(d, raw + 0xC * N);
    const V inD = LoadU(d, raw + 0xD * N);
    const V inE = LoadU(d, raw + 0xE * N);
    const V inF = LoadU(d, raw + 0xF * N);

    // Nibble layout, from bit 0 upwards:
    //   out0: raw0, raw2, raw4, raw6     out1: raw1, raw3, raw5, raw7
    //   out2: raw8, rawA, rawC, rawE     out3: raw9, rawB, rawD, rawF
    const V out0 = Or(Xor3(ShiftLeft<8>(in4), ShiftLeft<4>(in2), in0),
                      ShiftLeft<12>(in6));
    const V out1 = Or(Xor3(ShiftLeft<8>(in5), ShiftLeft<4>(in3), in1),
                      ShiftLeft<12>(in7));
    const V out2 = Or(Xor3(ShiftLeft<8>(inC), ShiftLeft<4>(inA), in8),
                      ShiftLeft<12>(inE));
    const V out3 = Or(Xor3(ShiftLeft<8>(inD), ShiftLeft<4>(inB), in9),
                      ShiftLeft<12>(inF));

    StoreU(out0, d, packed_out + 0 * N);
    StoreU(out1, d, packed_out + 1 * N);
    StoreU(out2, d, packed_out + 2 * N);
    StoreU(out3, d, packed_out + 3 * N);
  }

  // Loads 4 vectors from `packed_in` and stores 16 vectors to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);
    const V nibble = Set(d, 0xFu);  // lowest 4 bits

    const V pk0 = LoadU(d, packed_in + 0 * N);
    const V pk1 = LoadU(d, packed_in + 1 * N);
    const V pk2 = LoadU(d, packed_in + 2 * N);
    const V pk3 = LoadU(d, packed_in + 3 * N);

    // raw0..7 come from pk0 (even indices) and pk1 (odd indices).
    StoreU(And(pk0, nibble), d, raw + 0 * N);
    StoreU(And(pk1, nibble), d, raw + 1 * N);
    StoreU(And(ShiftRight<4>(pk0), nibble), d, raw + 2 * N);
    StoreU(And(ShiftRight<4>(pk1), nibble), d, raw + 3 * N);
    StoreU(And(ShiftRight<8>(pk0), nibble), d, raw + 4 * N);
    StoreU(And(ShiftRight<8>(pk1), nibble), d, raw + 5 * N);
    // The top nibble needs no mask: the logical shift leaves only 4 bits.
    StoreU(ShiftRight<12>(pk0), d, raw + 6 * N);
    StoreU(ShiftRight<12>(pk1), d, raw + 7 * N);

    // raw8..F come from pk2 (even indices) and pk3 (odd indices).
    StoreU(And(pk2, nibble), d, raw + 8 * N);
    StoreU(And(pk3, nibble), d, raw + 9 * N);
    StoreU(And(ShiftRight<4>(pk2), nibble), d, raw + 0xA * N);
    StoreU(And(ShiftRight<4>(pk3), nibble), d, raw + 0xB * N);
    StoreU(And(ShiftRight<8>(pk2), nibble), d, raw + 0xC * N);
    StoreU(And(ShiftRight<8>(pk3), nibble), d, raw + 0xD * N);
    StoreU(ShiftRight<12>(pk2), d, raw + 0xE * N);
    StoreU(ShiftRight<12>(pk3), d, raw + 0xF * N);
  }
};
// End of Pack16<4>.
// Pack16<5>: sixteen vectors of 5-bit values <-> five packed u16 vectors.
// Each packed lane holds three 5-bit fields in bits 0..14; bit 15 of the five
// packed lanes carries one bit each of the sixteenth input (rawF). Pack does
// not mask raw0..E, so those lanes are expected to use only their low 5 bits.
template <>
struct Pack16<5> {
  // Loads 16 vectors from `raw` and stores 5 vectors to `packed_out`.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * N);
    const V in1 = LoadU(d, raw + 1 * N);
    const V in2 = LoadU(d, raw + 2 * N);
    const V in3 = LoadU(d, raw + 3 * N);
    const V in4 = LoadU(d, raw + 4 * N);
    const V in5 = LoadU(d, raw + 5 * N);
    const V in6 = LoadU(d, raw + 6 * N);
    const V in7 = LoadU(d, raw + 7 * N);
    const V in8 = LoadU(d, raw + 8 * N);
    const V in9 = LoadU(d, raw + 9 * N);
    const V inA = LoadU(d, raw + 0xA * N);
    const V inB = LoadU(d, raw + 0xB * N);
    const V inC = LoadU(d, raw + 0xC * N);
    const V inD = LoadU(d, raw + 0xD * N);
    const V inE = LoadU(d, raw + 0xE * N);
    const V inF = LoadU(d, raw + 0xF * N);

    // Fifteen inputs fit into five outputs, three per lane.
    const V body0 = Xor3(ShiftLeft<10>(inA), ShiftLeft<5>(in5), in0);
    const V body1 = Xor3(ShiftLeft<10>(inB), ShiftLeft<5>(in6), in1);
    const V body2 = Xor3(ShiftLeft<10>(inC), ShiftLeft<5>(in7), in2);
    const V body3 = Xor3(ShiftLeft<10>(inD), ShiftLeft<5>(in8), in3);
    const V body4 = Xor3(ShiftLeft<10>(inE), ShiftLeft<5>(in9), in4);

    // Bit i of rawF goes into the MSB of output i.
    const V msb = Set(d, 0x8000u);
    // Shifting by 15 leaves only the MSB, so no mask is needed here.
    const V out0 = Or(body0, ShiftLeft<15>(inF));
    const V out1 = OrAnd(body1, ShiftLeft<14>(inF), msb);
    const V out2 = OrAnd(body2, ShiftLeft<13>(inF), msb);
    const V out3 = OrAnd(body3, ShiftLeft<12>(inF), msb);
    const V out4 = OrAnd(body4, ShiftLeft<11>(inF), msb);

    StoreU(out0, d, packed_out + 0 * N);
    StoreU(out1, d, packed_out + 1 * N);
    StoreU(out2, d, packed_out + 2 * N);
    StoreU(out3, d, packed_out + 3 * N);
    StoreU(out4, d, packed_out + 4 * N);
  }

  // Loads 5 vectors from `packed_in` and stores 16 vectors to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    const V pk0 = LoadU(d, packed_in + 0 * N);
    const V pk1 = LoadU(d, packed_in + 1 * N);
    const V pk2 = LoadU(d, packed_in + 2 * N);
    const V pk3 = LoadU(d, packed_in + 3 * N);
    const V pk4 = LoadU(d, packed_in + 4 * N);

    const V field = Set(d, 0x1Fu);  // lowest 5 bits

    // Bit offset 0: raw0..4.
    StoreU(And(pk0, field), d, raw + 0 * N);
    StoreU(And(pk1, field), d, raw + 1 * N);
    StoreU(And(pk2, field), d, raw + 2 * N);
    StoreU(And(pk3, field), d, raw + 3 * N);
    StoreU(And(pk4, field), d, raw + 4 * N);

    // Bit offset 5: raw5..9.
    StoreU(And(ShiftRight<5>(pk0), field), d, raw + 5 * N);
    StoreU(And(ShiftRight<5>(pk1), field), d, raw + 6 * N);
    StoreU(And(ShiftRight<5>(pk2), field), d, raw + 7 * N);
    StoreU(And(ShiftRight<5>(pk3), field), d, raw + 8 * N);
    StoreU(And(ShiftRight<5>(pk4), field), d, raw + 9 * N);

    // Bit offset 10: rawA..E. The mask also strips bit 15 (part of rawF).
    StoreU(And(ShiftRight<10>(pk0), field), d, raw + 0xA * N);
    StoreU(And(ShiftRight<10>(pk1), field), d, raw + 0xB * N);
    StoreU(And(ShiftRight<10>(pk2), field), d, raw + 0xC * N);
    StoreU(And(ShiftRight<10>(pk3), field), d, raw + 0xD * N);
    StoreU(And(ShiftRight<10>(pk4), field), d, raw + 0xE * N);

    // rawF is the concatenation of the upper bit (bit 15) of pk0..4: the MSB
    // of pk_i becomes bit i. Add(x, x) doubles, i.e. shifts left by one.
    const V bit0 = ShiftRight<15>(pk0);
    const V bit1 = ShiftRight<15>(pk1);
    const V msb = Set(d, 0x8000u);
    const V low3 =
        Xor3(ShiftRight<13>(And(pk2, msb)), Add(bit1, bit1), bit0);
    const V last = Xor3(ShiftRight<11>(And(pk4, msb)),
                        ShiftRight<12>(And(pk3, msb)), low3);
    StoreU(last, d, raw + 0xF * N);
  }
};
// End of Pack16<5>.
// Pack16<6>: sixteen vectors of 6-bit values <-> six packed u16 vectors.
// The inputs are handled as two groups of eight (raw0..7 and raw8..F). Within
// a group, three packed lanes each hold two 6-bit values in bits 0..11, and
// the remaining two values are fused into a 12-bit pair whose three nibbles
// fill bits 12..15 of those three lanes. Pack does not mask its inputs, so
// lanes of `raw` are expected to use only their low 6 bits.
template <>
struct Pack16<6> {
  // Loads 16 vectors from `raw` and stores 6 vectors to `packed_out`.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * N);
    const V in1 = LoadU(d, raw + 1 * N);
    const V in2 = LoadU(d, raw + 2 * N);
    const V in3 = LoadU(d, raw + 3 * N);
    const V in4 = LoadU(d, raw + 4 * N);
    const V in5 = LoadU(d, raw + 5 * N);
    const V in6 = LoadU(d, raw + 6 * N);
    const V in7 = LoadU(d, raw + 7 * N);
    const V in8 = LoadU(d, raw + 8 * N);
    const V in9 = LoadU(d, raw + 9 * N);
    const V inA = LoadU(d, raw + 0xA * N);
    const V inB = LoadU(d, raw + 0xB * N);
    const V inC = LoadU(d, raw + 0xC * N);
    const V inD = LoadU(d, raw + 0xD * N);
    const V inE = LoadU(d, raw + 0xE * N);
    const V inF = LoadU(d, raw + 0xF * N);

    // 12-bit pairs that get spread over the top nibbles.
    const V pair37 = Or(ShiftLeft<6>(in7), in3);
    const V pairBF = Or(ShiftLeft<6>(inF), inB);
    const V top4 = Set(d, 0xF000u);

    // First group. ShiftLeft<12> keeps only the lowest nibble of the pair, so
    // a0 needs no mask; a1/a2 receive the middle/upper nibble.
    const V a0 = Xor3(ShiftLeft<12>(pair37), ShiftLeft<6>(in4), in0);
    const V a1 = OrAnd(Or(ShiftLeft<6>(in5), in1), ShiftLeft<8>(pair37), top4);
    const V a2 = OrAnd(Or(ShiftLeft<6>(in6), in2), ShiftLeft<4>(pair37), top4);

    // Second group, same layout.
    const V b0 = Xor3(ShiftLeft<12>(pairBF), ShiftLeft<6>(inC), in8);
    const V b1 = OrAnd(Or(ShiftLeft<6>(inD), in9), ShiftLeft<8>(pairBF), top4);
    const V b2 = OrAnd(Or(ShiftLeft<6>(inE), inA), ShiftLeft<4>(pairBF), top4);

    StoreU(a0, d, packed_out + 0 * N);
    StoreU(a1, d, packed_out + 1 * N);
    StoreU(a2, d, packed_out + 2 * N);
    StoreU(b0, d, packed_out + 3 * N);
    StoreU(b1, d, packed_out + 4 * N);
    StoreU(b2, d, packed_out + 5 * N);
  }

  // Loads 6 vectors from `packed_in` and stores 16 vectors to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);
    const V field = Set(d, 0x3Fu);  // lowest 6 bits

    // a* hold raw0..7, b* hold raw8..F.
    const V a0 = LoadU(d, packed_in + 0 * N);
    const V a1 = LoadU(d, packed_in + 1 * N);
    const V a2 = LoadU(d, packed_in + 2 * N);
    const V b0 = LoadU(d, packed_in + 3 * N);
    const V b1 = LoadU(d, packed_in + 4 * N);
    const V b2 = LoadU(d, packed_in + 5 * N);

    // Values stored directly in bits 0..5 and 6..11.
    StoreU(And(a0, field), d, raw + 0 * N);
    StoreU(And(a1, field), d, raw + 1 * N);
    StoreU(And(a2, field), d, raw + 2 * N);
    StoreU(And(ShiftRight<6>(a0), field), d, raw + 4 * N);
    StoreU(And(ShiftRight<6>(a1), field), d, raw + 5 * N);
    StoreU(And(ShiftRight<6>(a2), field), d, raw + 6 * N);

    StoreU(And(b0, field), d, raw + 8 * N);
    StoreU(And(b1, field), d, raw + 9 * N);
    StoreU(And(b2, field), d, raw + 0xA * N);
    StoreU(And(ShiftRight<6>(b0), field), d, raw + 0xC * N);
    StoreU(And(ShiftRight<6>(b1), field), d, raw + 0xD * N);
    StoreU(And(ShiftRight<6>(b2), field), d, raw + 0xE * N);

    // Reassemble each 12-bit pair from the top nibble of its three lanes:
    // lane 0 supplies bits 0..3, lane 1 bits 4..7, lane 2 bits 8..11.
    const V lowA = ShiftRight<12>(a0);
    const V lowB = ShiftRight<12>(b0);
    const V top4 = Set(d, 0xF000u);
    const V pair37 = Xor3(ShiftRight<4>(And(a2, top4)),
                          ShiftRight<8>(And(a1, top4)), lowA);
    const V pairBF = Xor3(ShiftRight<4>(And(b2, top4)),
                          ShiftRight<8>(And(b1, top4)), lowB);

    StoreU(And(pair37, field), d, raw + 3 * N);
    StoreU(And(pairBF, field), d, raw + 0xB * N);
    // The pairs are only 12 bits wide, so bits above the upper value are
    // already zero after the shift.
    StoreU(ShiftRight<6>(pair37), d, raw + 7 * N);
    StoreU(ShiftRight<6>(pairBF), d, raw + 0xF * N);
  }
};
// End of Pack16<6>.
// Pack16<7>: sixteen vectors of 7-bit values <-> seven packed u16 vectors.
// Each packed lane holds two 7-bit values in bits 0..13. The remaining two
// inputs (raw7 and rawF) are fused into a 14-bit pair whose seven 2-bit pieces
// fill bits 14..15 of the seven packed lanes. Pack does not mask its inputs,
// so lanes of `raw` are expected to use only their low 7 bits.
template <>
struct Pack16<7> {
  // Loads 16 vectors from `raw` and stores 7 vectors to `packed_out`.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * N);
    const V in1 = LoadU(d, raw + 1 * N);
    const V in2 = LoadU(d, raw + 2 * N);
    const V in3 = LoadU(d, raw + 3 * N);
    const V in4 = LoadU(d, raw + 4 * N);
    const V in5 = LoadU(d, raw + 5 * N);
    const V in6 = LoadU(d, raw + 6 * N);
    const V in7 = LoadU(d, raw + 7 * N);
    const V in8 = LoadU(d, raw + 8 * N);
    const V in9 = LoadU(d, raw + 9 * N);
    const V inA = LoadU(d, raw + 0xA * N);
    const V inB = LoadU(d, raw + 0xB * N);
    const V inC = LoadU(d, raw + 0xC * N);
    const V inD = LoadU(d, raw + 0xD * N);
    const V inE = LoadU(d, raw + 0xE * N);
    const V inF = LoadU(d, raw + 0xF * N);

    // 14-bit pair that is spread over the two spare top bits of each output.
    const V pair7F = Or(ShiftLeft<7>(inF), in7);
    const V top2 = Set(d, 0xC000u);

    // ShiftLeft<14> keeps only bits 0..1 of the pair, so out0 needs no mask.
    const V out0 = Xor3(ShiftLeft<14>(pair7F), ShiftLeft<7>(in8), in0);
    // Output i (1..6) receives bits 2i..2i+1 of the pair.
    const V out1 =
        OrAnd(Or(ShiftLeft<7>(in9), in1), ShiftLeft<12>(pair7F), top2);
    const V out2 =
        OrAnd(Or(ShiftLeft<7>(inA), in2), ShiftLeft<10>(pair7F), top2);
    const V out3 =
        OrAnd(Or(ShiftLeft<7>(inB), in3), ShiftLeft<8>(pair7F), top2);
    const V out4 =
        OrAnd(Or(ShiftLeft<7>(inC), in4), ShiftLeft<6>(pair7F), top2);
    const V out5 =
        OrAnd(Or(ShiftLeft<7>(inD), in5), ShiftLeft<4>(pair7F), top2);
    const V out6 =
        OrAnd(Or(ShiftLeft<7>(inE), in6), ShiftLeft<2>(pair7F), top2);

    StoreU(out0, d, packed_out + 0 * N);
    StoreU(out1, d, packed_out + 1 * N);
    StoreU(out2, d, packed_out + 2 * N);
    StoreU(out3, d, packed_out + 3 * N);
    StoreU(out4, d, packed_out + 4 * N);
    StoreU(out5, d, packed_out + 5 * N);
    StoreU(out6, d, packed_out + 6 * N);
  }

  // Loads 7 vectors from `packed_in` and stores 16 vectors to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    // LoadU(d, ...) already yields Vec<D>; no BitCast is required.
    const V pk0 = LoadU(d, packed_in + 0 * N);
    const V pk1 = LoadU(d, packed_in + 1 * N);
    const V pk2 = LoadU(d, packed_in + 2 * N);
    const V pk3 = LoadU(d, packed_in + 3 * N);
    const V pk4 = LoadU(d, packed_in + 4 * N);
    const V pk5 = LoadU(d, packed_in + 5 * N);
    const V pk6 = LoadU(d, packed_in + 6 * N);

    const V field = Set(d, 0x7Fu);  // lowest 7 bits

    // Bit offset 0: raw0..6.
    StoreU(And(pk0, field), d, raw + 0 * N);
    StoreU(And(pk1, field), d, raw + 1 * N);
    StoreU(And(pk2, field), d, raw + 2 * N);
    StoreU(And(pk3, field), d, raw + 3 * N);
    StoreU(And(pk4, field), d, raw + 4 * N);
    StoreU(And(pk5, field), d, raw + 5 * N);
    StoreU(And(pk6, field), d, raw + 6 * N);

    // Bit offset 7: raw8..E. The mask also strips the two top bits.
    StoreU(And(ShiftRight<7>(pk0), field), d, raw + 8 * N);
    StoreU(And(ShiftRight<7>(pk1), field), d, raw + 9 * N);
    StoreU(And(ShiftRight<7>(pk2), field), d, raw + 0xA * N);
    StoreU(And(ShiftRight<7>(pk3), field), d, raw + 0xB * N);
    StoreU(And(ShiftRight<7>(pk4), field), d, raw + 0xC * N);
    StoreU(And(ShiftRight<7>(pk5), field), d, raw + 0xD * N);
    StoreU(And(ShiftRight<7>(pk6), field), d, raw + 0xE * N);

    // Rebuild the 14-bit pair from the two upper bits of pk0..6: pk_i supplies
    // bits 2i..2i+1.
    const V bits01 = ShiftRight<14>(pk0);
    const V top2 = Set(d, 0xC000u);
    const V bits0to5 = Xor3(ShiftRight<12>(And(pk1, top2)),
                            ShiftRight<10>(And(pk2, top2)), bits01);
    const V bits6to11 = Xor3(ShiftRight<8>(And(pk3, top2)),
                             ShiftRight<6>(And(pk4, top2)),
                             ShiftRight<4>(And(pk5, top2)));
    const V pair7F =
        Xor3(ShiftRight<2>(And(pk6, top2)), bits6to11, bits0to5);

    StoreU(And(pair7F, field), d, raw + 7 * N);
    // The pair is only 14 bits wide, so the upper bits are already zero.
    StoreU(ShiftRight<7>(pair7F), d, raw + 0xF * N);
  }
};
// End of Pack16<7>.
// Pack16<8>: sixteen vectors of 8-bit values <-> eight packed u16 vectors.
// Each packed lane holds two bytes. Pack does not mask its inputs, so lanes of
// `raw` are expected to use only their low 8 bits.
template <>
struct Pack16<8> {
  // Loads 16 vectors from `raw` and stores 8 vectors to `packed_out`.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * N);
    const V in1 = LoadU(d, raw + 1 * N);
    const V in2 = LoadU(d, raw + 2 * N);
    const V in3 = LoadU(d, raw + 3 * N);
    const V in4 = LoadU(d, raw + 4 * N);
    const V in5 = LoadU(d, raw + 5 * N);
    const V in6 = LoadU(d, raw + 6 * N);
    const V in7 = LoadU(d, raw + 7 * N);
    const V in8 = LoadU(d, raw + 8 * N);
    const V in9 = LoadU(d, raw + 9 * N);
    const V inA = LoadU(d, raw + 0xA * N);
    const V inB = LoadU(d, raw + 0xB * N);
    const V inC = LoadU(d, raw + 0xC * N);
    const V inD = LoadU(d, raw + 0xD * N);
    const V inE = LoadU(d, raw + 0xE * N);
    const V inF = LoadU(d, raw + 0xF * N);

    // This is equivalent to ConcatEven with 8-bit lanes, but much more
    // efficient on RVV and slightly less efficient on SVE2.
    // Low byte / high byte per output:
    //   out0: raw0/raw2  out1: raw1/raw3  out2: raw4/raw6  out3: raw5/raw7
    //   out4: raw8/rawA  out5: raw9/rawB  out6: rawC/rawE  out7: rawD/rawF
    StoreU(Or(ShiftLeft<8>(in2), in0), d, packed_out + 0 * N);
    StoreU(Or(ShiftLeft<8>(in3), in1), d, packed_out + 1 * N);
    StoreU(Or(ShiftLeft<8>(in6), in4), d, packed_out + 2 * N);
    StoreU(Or(ShiftLeft<8>(in7), in5), d, packed_out + 3 * N);
    StoreU(Or(ShiftLeft<8>(inA), in8), d, packed_out + 4 * N);
    StoreU(Or(ShiftLeft<8>(inB), in9), d, packed_out + 5 * N);
    StoreU(Or(ShiftLeft<8>(inE), inC), d, packed_out + 6 * N);
    StoreU(Or(ShiftLeft<8>(inF), inD), d, packed_out + 7 * N);
  }

  // Loads 8 vectors from `packed_in` and stores 16 vectors to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    // LoadU(d, ...) already yields Vec<D>; no BitCast is required.
    const V pk0 = LoadU(d, packed_in + 0 * N);
    const V pk1 = LoadU(d, packed_in + 1 * N);
    const V pk2 = LoadU(d, packed_in + 2 * N);
    const V pk3 = LoadU(d, packed_in + 3 * N);
    const V pk4 = LoadU(d, packed_in + 4 * N);
    const V pk5 = LoadU(d, packed_in + 5 * N);
    const V pk6 = LoadU(d, packed_in + 6 * N);
    const V pk7 = LoadU(d, packed_in + 7 * N);

    const V low8 = Set(d, 0xFFu);  // lowest 8 bits

    // Low bytes are masked; high bytes only need the logical shift, which
    // leaves the upper bits zero.
    StoreU(And(pk0, low8), d, raw + 0 * N);
    StoreU(And(pk1, low8), d, raw + 1 * N);
    StoreU(ShiftRight<8>(pk0), d, raw + 2 * N);
    StoreU(ShiftRight<8>(pk1), d, raw + 3 * N);

    StoreU(And(pk2, low8), d, raw + 4 * N);
    StoreU(And(pk3, low8), d, raw + 5 * N);
    StoreU(ShiftRight<8>(pk2), d, raw + 6 * N);
    StoreU(ShiftRight<8>(pk3), d, raw + 7 * N);

    StoreU(And(pk4, low8), d, raw + 8 * N);
    StoreU(And(pk5, low8), d, raw + 9 * N);
    StoreU(ShiftRight<8>(pk4), d, raw + 0xA * N);
    StoreU(ShiftRight<8>(pk5), d, raw + 0xB * N);

    StoreU(And(pk6, low8), d, raw + 0xC * N);
    StoreU(And(pk7, low8), d, raw + 0xD * N);
    StoreU(ShiftRight<8>(pk6), d, raw + 0xE * N);
    StoreU(ShiftRight<8>(pk7), d, raw + 0xF * N);
  }
};
// End of Pack16<8>.
// Pack16<9>: sixteen vectors of 9-bit values <-> nine packed u16 vectors.
// Packed lanes 0..7 hold a full 9-bit value (raw0..7) in bits 0..8 and the low
// 7 bits of raw8..F in bits 9..15. The top two bits (bits 7..8) of raw8..F are
// concatenated into the ninth packed vector. Pack does not mask raw0..7, so
// those lanes are expected to use only their low 9 bits.
template <>
struct Pack16<9> {
  // Loads 16 vectors from `raw` and stores 9 vectors to `packed_out`.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * N);
    const V in1 = LoadU(d, raw + 1 * N);
    const V in2 = LoadU(d, raw + 2 * N);
    const V in3 = LoadU(d, raw + 3 * N);
    const V in4 = LoadU(d, raw + 4 * N);
    const V in5 = LoadU(d, raw + 5 * N);
    const V in6 = LoadU(d, raw + 6 * N);
    const V in7 = LoadU(d, raw + 7 * N);
    const V in8 = LoadU(d, raw + 8 * N);
    const V in9 = LoadU(d, raw + 9 * N);
    const V inA = LoadU(d, raw + 0xA * N);
    const V inB = LoadU(d, raw + 0xB * N);
    const V inC = LoadU(d, raw + 0xC * N);
    const V inD = LoadU(d, raw + 0xD * N);
    const V inE = LoadU(d, raw + 0xE * N);
    const V inF = LoadU(d, raw + 0xF * N);

    // 9 + 7 bits per lane; the shift drops the top two bits of raw8..F.
    const V out0 = Or(ShiftLeft<9>(in8), in0);
    const V out1 = Or(ShiftLeft<9>(in9), in1);
    const V out2 = Or(ShiftLeft<9>(inA), in2);
    const V out3 = Or(ShiftLeft<9>(inB), in3);
    const V out4 = Or(ShiftLeft<9>(inC), in4);
    const V out5 = Or(ShiftLeft<9>(inD), in5);
    const V out6 = Or(ShiftLeft<9>(inE), in6);
    const V out7 = Or(ShiftLeft<9>(inF), in7);

    // Gather the dropped bits. We could shift down, OR and shift up, but two
    // shifts are typically more expensive than AND, shift into position, and
    // OR (which can be further reduced via Xor3).
    const V top2of9 = Set(d, 0x180u);  // bits 7..8
    const V bits01 = ShiftRight<7>(And(in8, top2of9));
    const V bits23 = ShiftRight<5>(And(in9, top2of9));
    const V bits45 = ShiftRight<3>(And(inA, top2of9));
    const V bits67 = ShiftRight<1>(And(inB, top2of9));
    const V bits89 = ShiftLeft<1>(And(inC, top2of9));
    const V bitsAB = ShiftLeft<3>(And(inD, top2of9));
    const V bitsCD = ShiftLeft<5>(And(inE, top2of9));
    const V bitsEF = ShiftLeft<7>(And(inF, top2of9));
    const V out8 = Xor3(Xor3(bits01, bits23, bits45),
                        Xor3(bits67, bits89, bitsAB), Or(bitsCD, bitsEF));

    StoreU(out0, d, packed_out + 0 * N);
    StoreU(out1, d, packed_out + 1 * N);
    StoreU(out2, d, packed_out + 2 * N);
    StoreU(out3, d, packed_out + 3 * N);
    StoreU(out4, d, packed_out + 4 * N);
    StoreU(out5, d, packed_out + 5 * N);
    StoreU(out6, d, packed_out + 6 * N);
    StoreU(out7, d, packed_out + 7 * N);
    StoreU(out8, d, packed_out + 8 * N);
  }

  // Loads 9 vectors from `packed_in` and stores 16 vectors to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    // LoadU(d, ...) already yields Vec<D>; no BitCast is required.
    const V pk0 = LoadU(d, packed_in + 0 * N);
    const V pk1 = LoadU(d, packed_in + 1 * N);
    const V pk2 = LoadU(d, packed_in + 2 * N);
    const V pk3 = LoadU(d, packed_in + 3 * N);
    const V pk4 = LoadU(d, packed_in + 4 * N);
    const V pk5 = LoadU(d, packed_in + 5 * N);
    const V pk6 = LoadU(d, packed_in + 6 * N);
    const V pk7 = LoadU(d, packed_in + 7 * N);
    const V pk8 = LoadU(d, packed_in + 8 * N);

    const V field = Set(d, 0x1FFu);  // lowest 9 bits

    // raw0..7 are stored whole in the low 9 bits.
    StoreU(And(pk0, field), d, raw + 0 * N);
    StoreU(And(pk1, field), d, raw + 1 * N);
    StoreU(And(pk2, field), d, raw + 2 * N);
    StoreU(And(pk3, field), d, raw + 3 * N);
    StoreU(And(pk4, field), d, raw + 4 * N);
    StoreU(And(pk5, field), d, raw + 5 * N);
    StoreU(And(pk6, field), d, raw + 6 * N);
    StoreU(And(pk7, field), d, raw + 7 * N);

    // raw8..F: low 7 bits from the upper part of pk0..7, bits 7..8 from the
    // matching 2-bit group of pk8 moved into position and masked.
    const V top2of9 = Set(d, 0x180u);  // bits 7..8
    const V out8 = OrAnd(ShiftRight<9>(pk0), ShiftLeft<7>(pk8), top2of9);
    const V out9 = OrAnd(ShiftRight<9>(pk1), ShiftLeft<5>(pk8), top2of9);
    const V outA = OrAnd(ShiftRight<9>(pk2), ShiftLeft<3>(pk8), top2of9);
    const V outB = OrAnd(ShiftRight<9>(pk3), ShiftLeft<1>(pk8), top2of9);
    const V outC = OrAnd(ShiftRight<9>(pk4), ShiftRight<1>(pk8), top2of9);
    const V outD = OrAnd(ShiftRight<9>(pk5), ShiftRight<3>(pk8), top2of9);
    const V outE = OrAnd(ShiftRight<9>(pk6), ShiftRight<5>(pk8), top2of9);
    const V outF = OrAnd(ShiftRight<9>(pk7), ShiftRight<7>(pk8), top2of9);

    StoreU(out8, d, raw + 8 * N);
    StoreU(out9, d, raw + 9 * N);
    StoreU(outA, d, raw + 0xA * N);
    StoreU(outB, d, raw + 0xB * N);
    StoreU(outC, d, raw + 0xC * N);
    StoreU(outD, d, raw + 0xD * N);
    StoreU(outE, d, raw + 0xE * N);
    StoreU(outF, d, raw + 0xF * N);
  }
};
// End of Pack16<9>.
// Pack16<10>: sixteen vectors of 10-bit values <-> ten packed u16 vectors.
// Packed lanes 0..7 hold a full 10-bit value (raw0..7) in bits 0..9 and the
// low 6 bits of raw8..F in bits 10..15. The top four bits (bits 6..9) of
// raw8..B and rawC..F are concatenated into the ninth and tenth packed vector
// respectively. Pack does not mask raw0..7, so those lanes are expected to use
// only their low 10 bits.
template <>
struct Pack16<10> {
  // Loads 16 vectors from `raw` and stores 10 vectors to `packed_out`.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * N);
    const V in1 = LoadU(d, raw + 1 * N);
    const V in2 = LoadU(d, raw + 2 * N);
    const V in3 = LoadU(d, raw + 3 * N);
    const V in4 = LoadU(d, raw + 4 * N);
    const V in5 = LoadU(d, raw + 5 * N);
    const V in6 = LoadU(d, raw + 6 * N);
    const V in7 = LoadU(d, raw + 7 * N);
    const V in8 = LoadU(d, raw + 8 * N);
    const V in9 = LoadU(d, raw + 9 * N);
    const V inA = LoadU(d, raw + 0xA * N);
    const V inB = LoadU(d, raw + 0xB * N);
    const V inC = LoadU(d, raw + 0xC * N);
    const V inD = LoadU(d, raw + 0xD * N);
    const V inE = LoadU(d, raw + 0xE * N);
    const V inF = LoadU(d, raw + 0xF * N);

    // 10 + 6 bits per lane; the shift drops the top four bits of raw8..F.
    const V out0 = Or(ShiftLeft<10>(in8), in0);
    const V out1 = Or(ShiftLeft<10>(in9), in1);
    const V out2 = Or(ShiftLeft<10>(inA), in2);
    const V out3 = Or(ShiftLeft<10>(inB), in3);
    const V out4 = Or(ShiftLeft<10>(inC), in4);
    const V out5 = Or(ShiftLeft<10>(inD), in5);
    const V out6 = Or(ShiftLeft<10>(inE), in6);
    const V out7 = Or(ShiftLeft<10>(inF), in7);

    // Gather the dropped bits. We could shift down, OR and shift up, but two
    // shifts are typically more expensive than AND, shift into position, and
    // OR (which can be further reduced via Xor3).
    const V top4of10 = Set(d, 0x3C0u);  // bits 6..9
    const V nib8 = ShiftRight<6>(And(in8, top4of10));  // -> bits 0..3
    const V nib9 = ShiftRight<2>(And(in9, top4of10));  // -> bits 4..7
    const V nibA = ShiftLeft<2>(And(inA, top4of10));   // -> bits 8..11
    const V nibB = ShiftLeft<6>(And(inB, top4of10));   // -> bits 12..15
    const V nibC = ShiftRight<6>(And(inC, top4of10));
    const V nibD = ShiftRight<2>(And(inD, top4of10));
    const V nibE = ShiftLeft<2>(And(inE, top4of10));
    const V nibF = ShiftLeft<6>(And(inF, top4of10));
    const V out8 = Or(Xor3(nib8, nib9, nibA), nibB);
    const V out9 = Or(Xor3(nibC, nibD, nibE), nibF);

    StoreU(out0, d, packed_out + 0 * N);
    StoreU(out1, d, packed_out + 1 * N);
    StoreU(out2, d, packed_out + 2 * N);
    StoreU(out3, d, packed_out + 3 * N);
    StoreU(out4, d, packed_out + 4 * N);
    StoreU(out5, d, packed_out + 5 * N);
    StoreU(out6, d, packed_out + 6 * N);
    StoreU(out7, d, packed_out + 7 * N);
    StoreU(out8, d, packed_out + 8 * N);
    StoreU(out9, d, packed_out + 9 * N);
  }

  // Loads 10 vectors from `packed_in` and stores 16 vectors to `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t N = Lanes(d);

    // LoadU(d, ...) already yields Vec<D>; no BitCast is required.
    const V pk0 = LoadU(d, packed_in + 0 * N);
    const V pk1 = LoadU(d, packed_in + 1 * N);
    const V pk2 = LoadU(d, packed_in + 2 * N);
    const V pk3 = LoadU(d, packed_in + 3 * N);
    const V pk4 = LoadU(d, packed_in + 4 * N);
    const V pk5 = LoadU(d, packed_in + 5 * N);
    const V pk6 = LoadU(d, packed_in + 6 * N);
    const V pk7 = LoadU(d, packed_in + 7 * N);
    const V pk8 = LoadU(d, packed_in + 8 * N);
    const V pk9 = LoadU(d, packed_in + 9 * N);

    const V field = Set(d, 0x3FFu);  // lowest 10 bits

    // raw0..7 are stored whole in the low 10 bits.
    StoreU(And(pk0, field), d, raw + 0 * N);
    StoreU(And(pk1, field), d, raw + 1 * N);
    StoreU(And(pk2, field), d, raw + 2 * N);
    StoreU(And(pk3, field), d, raw + 3 * N);
    StoreU(And(pk4, field), d, raw + 4 * N);
    StoreU(And(pk5, field), d, raw + 5 * N);
    StoreU(And(pk6, field), d, raw + 6 * N);
    StoreU(And(pk7, field), d, raw + 7 * N);

    // raw8..F: low 6 bits from the upper part of pk0..7, bits 6..9 from the
    // matching nibble of pk8 (raw8..B) or pk9 (rawC..F), moved into position
    // and masked.
    const V top4of10 = Set(d, 0x3C0u);  // bits 6..9
    const V out8 = OrAnd(ShiftRight<10>(pk0), ShiftLeft<6>(pk8), top4of10);
    const V out9 = OrAnd(ShiftRight<10>(pk1), ShiftLeft<2>(pk8), top4of10);
    const V outA = OrAnd(ShiftRight<10>(pk2), ShiftRight<2>(pk8), top4of10);
    const V outB = OrAnd(ShiftRight<10>(pk3), ShiftRight<6>(pk8), top4of10);
    const V outC = OrAnd(ShiftRight<10>(pk4), ShiftLeft<6>(pk9), top4of10);
    const V outD = OrAnd(ShiftRight<10>(pk5), ShiftLeft<2>(pk9), top4of10);
    const V outE = OrAnd(ShiftRight<10>(pk6), ShiftRight<2>(pk9), top4of10);
    const V outF = OrAnd(ShiftRight<10>(pk7), ShiftRight<6>(pk9), top4of10);

    StoreU(out8, d, raw + 8 * N);
    StoreU(out9, d, raw + 9 * N);
    StoreU(outA, d, raw + 0xA * N);
    StoreU(outB, d, raw + 0xB * N);
    StoreU(outC, d, raw + 0xC * N);
    StoreU(outD, d, raw + 0xD * N);
    StoreU(outE, d, raw + 0xE * N);
    StoreU(outF, d, raw + 0xF * N);
  }
};
// End of Pack16<10>.
template <>
struct Pack16<
11> {
// Pack for 11 bits per lane: loads 16 vectors from `raw` and stores 11 vectors
// to `packed_out`.
// Each input is split into its low 8 bits and its top 3 bits (bits 8..10).
// Outputs 0..7 hold the low bytes of two inputs each; outputs 8..A hold five
// 3-bit remnants each in bits 0..14, plus one bit of rawF's remnant in bit 15.
template <class D>
HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                     uint16_t* HWY_RESTRICT packed_out) const {
  using V = Vec<D>;
  const size_t N = Lanes(d);

  const V in0 = LoadU(d, raw + 0 * N);
  const V in1 = LoadU(d, raw + 1 * N);
  const V in2 = LoadU(d, raw + 2 * N);
  const V in3 = LoadU(d, raw + 3 * N);
  const V in4 = LoadU(d, raw + 4 * N);
  const V in5 = LoadU(d, raw + 5 * N);
  const V in6 = LoadU(d, raw + 6 * N);
  const V in7 = LoadU(d, raw + 7 * N);
  const V in8 = LoadU(d, raw + 8 * N);
  const V in9 = LoadU(d, raw + 9 * N);
  const V inA = LoadU(d, raw + 0xA * N);
  const V inB = LoadU(d, raw + 0xB * N);
  const V inC = LoadU(d, raw + 0xC * N);
  const V inD = LoadU(d, raw + 0xD * N);
  const V inE = LoadU(d, raw + 0xE * N);
  const V inF = LoadU(d, raw + 0xF * N);

  // The optimal partitioning is not obvious. Fewer distinct bit lengths means
  // fewer constants. An 11+5 split would also need 6-bit remnants with 4-bit
  // leftovers; 8+3 seems better because 3 bits are easier to scatter into the
  // MSBs.
  const V low8 = Set(d, 0xFFu);  // lower 8 bits of every input

  // Low bytes: the even-indexed input is masked into bits 0..7; the shift
  // leaves only the low byte of the odd-indexed input in bits 8..15.
  StoreU(OrAnd(ShiftLeft<8>(in1), in0, low8), d, packed_out + 0 * N);
  StoreU(OrAnd(ShiftLeft<8>(in3), in2, low8), d, packed_out + 1 * N);
  StoreU(OrAnd(ShiftLeft<8>(in5), in4, low8), d, packed_out + 2 * N);
  StoreU(OrAnd(ShiftLeft<8>(in7), in6, low8), d, packed_out + 3 * N);
  StoreU(OrAnd(ShiftLeft<8>(in9), in8, low8), d, packed_out + 4 * N);
  StoreU(OrAnd(ShiftLeft<8>(inB), inA, low8), d, packed_out + 5 * N);
  StoreU(OrAnd(ShiftLeft<8>(inD), inC, low8), d, packed_out + 6 * N);
  StoreU(OrAnd(ShiftLeft<8>(inF), inE, low8), d, packed_out + 7 * N);

  // Masks for the successive 3-bit groups. Shifting the mask instead of
  // masking each input separately keeps the constant count low and lets every
  // insertion be a single OrAnd.
  const V group1 = Set(d, 0x38u);         // 0x7 << 3: bits 3..5
  const V group2 = ShiftLeft<3>(group1);  // bits 6..8
  const V group3 = ShiftLeft<3>(group2);  // bits 9..11
  const V group4 = ShiftLeft<3>(group3);  // bits 12..14
  const V msb = ShiftLeft<3>(group4);     // = 0x8000u (higher bits fall off)

  // Remnants of raw0, raw3, raw6, raw9, rawC, plus bit 8 of rawF in the MSB.
  // Add(x, x) doubles, i.e. shifts left by one.
  V rem0 = OrAnd(ShiftRight<8>(in0), ShiftRight<5>(in3), group1);
  rem0 = OrAnd(rem0, ShiftRight<2>(in6), group2);
  rem0 = OrAnd(rem0, Add(in9, in9), group3);
  rem0 = OrAnd(rem0, ShiftLeft<4>(inC), group4);
  rem0 = OrAnd(rem0, ShiftLeft<7>(inF), msb);

  // Remnants of raw1, raw4, raw7, rawA, rawD, plus bit 9 of rawF in the MSB.
  V rem1 = OrAnd(ShiftRight<8>(in1), ShiftRight<5>(in4), group1);
  rem1 = OrAnd(rem1, ShiftRight<2>(in7), group2);
  rem1 = OrAnd(rem1, Add(inA, inA), group3);
  rem1 = OrAnd(rem1, ShiftLeft<4>(inD), group4);
  rem1 = OrAnd(rem1, ShiftLeft<6>(inF), msb);

  // Remnants of raw2, raw5, raw8, rawB, rawE, plus bit 10 of rawF in the MSB.
  V rem2 = OrAnd(ShiftRight<8>(in2), ShiftRight<5>(in5), group1);
  rem2 = OrAnd(rem2, ShiftRight<2>(in8), group2);
  rem2 = OrAnd(rem2, Add(inB, inB), group3);
  rem2 = OrAnd(rem2, ShiftLeft<4>(inE), group4);
  rem2 = OrAnd(rem2, ShiftLeft<5>(inF), msb);

  StoreU(rem0, d, packed_out + 8 * N);
  StoreU(rem1, d, packed_out + 9 * N);
  StoreU(rem2, d, packed_out + 0xA * N);
}
// Unpacks 11 vectors read from `packed_in` (in the layout written by Pack
// above) into 16 vectors of 11-bit lanes written to `raw`.
template <class D>
HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                       uint16_t* HWY_RESTRICT raw) const {
  using VU16 = Vec<decltype(d)>;
  const size_t N = Lanes(d);

  const VU16 packed0 = LoadU(d, packed_in + 0 * N);
  const VU16 packed1 = LoadU(d, packed_in + 1 * N);
  const VU16 packed2 = LoadU(d, packed_in + 2 * N);
  const VU16 packed3 = LoadU(d, packed_in + 3 * N);
  const VU16 packed4 = LoadU(d, packed_in + 4 * N);
  const VU16 packed5 = LoadU(d, packed_in + 5 * N);
  const VU16 packed6 = LoadU(d, packed_in + 6 * N);
  const VU16 packed7 = LoadU(d, packed_in + 7 * N);
  const VU16 packed8 = LoadU(d, packed_in + 8 * N);
  const VU16 packed9 = LoadU(d, packed_in + 9 * N);
  const VU16 packedA = LoadU(d, packed_in + 0xA * N);

  const VU16 mask = Set(d, 0xFFu);  // Lowest 8 bits

  // packed0..7 each hold the low bytes of two raw vectors.
  const VU16 down0 = And(packed0, mask);
  const VU16 down1 = ShiftRight<8>(packed0);
  const VU16 down2 = And(packed1, mask);
  const VU16 down3 = ShiftRight<8>(packed1);
  const VU16 down4 = And(packed2, mask);
  const VU16 down5 = ShiftRight<8>(packed2);
  const VU16 down6 = And(packed3, mask);
  const VU16 down7 = ShiftRight<8>(packed3);
  const VU16 down8 = And(packed4, mask);
  const VU16 down9 = ShiftRight<8>(packed4);
  const VU16 downA = And(packed5, mask);
  const VU16 downB = ShiftRight<8>(packed5);
  const VU16 downC = And(packed6, mask);
  const VU16 downD = ShiftRight<8>(packed6);
  const VU16 downE = And(packed7, mask);
  const VU16 downF = ShiftRight<8>(packed7);

  // Three bits from packed8..A, eight bits from down0..F. Each 3-bit field
  // is shifted so that it lands in bits 8..10, then masked.
  const VU16 hi3 = Set(d, 0x700u);
  const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3);
  const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3);
  const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3);

  const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3);
  const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3);
  const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3);

  const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3);
  const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3);
  const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3);

  const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3);
  const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3);
  const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3);

  const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3);
  const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3);
  const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3);

  // The three upper bits of rawF are the MSBs (bit 15) of packed8..A.
  // Isolate each MSB with ShiftRight<15> before moving it to bit 8, 9 or 10.
  // Shifting first and masking with the 3-bit `hi3` afterwards would also
  // let bits 13..14 (which belong to rawD/rawE) leak into rawF.
  const VU16 rawF =
      Or(downF, Xor3(ShiftLeft<8>(ShiftRight<15>(packed8)),
                     ShiftLeft<9>(ShiftRight<15>(packed9)),
                     ShiftLeft<10>(ShiftRight<15>(packedA))));

  StoreU(raw0, d, raw + 0 * N);
  StoreU(raw1, d, raw + 1 * N);
  StoreU(raw2, d, raw + 2 * N);
  StoreU(raw3, d, raw + 3 * N);
  StoreU(raw4, d, raw + 4 * N);
  StoreU(raw5, d, raw + 5 * N);
  StoreU(raw6, d, raw + 6 * N);
  StoreU(raw7, d, raw + 7 * N);
  StoreU(raw8, d, raw + 8 * N);
  StoreU(raw9, d, raw + 9 * N);
  StoreU(rawA, d, raw + 0xA * N);
  StoreU(rawB, d, raw + 0xB * N);
  StoreU(rawC, d, raw + 0xC * N);
  StoreU(rawD, d, raw + 0xD * N);
  StoreU(rawE, d, raw + 0xE * N);
  StoreU(rawF, d, raw + 0xF * N);
}
};
// Pack16<11>
template <>
struct Pack16<12> {
  // Packs 16 vectors of 12-bit lanes read from `raw` into 12 vectors written
  // to `packed_out`. Lanes must not exceed 12 bits: raw0..7 are ORed in
  // without masking.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * stride);
    const V in1 = LoadU(d, raw + 1 * stride);
    const V in2 = LoadU(d, raw + 2 * stride);
    const V in3 = LoadU(d, raw + 3 * stride);
    const V in4 = LoadU(d, raw + 4 * stride);
    const V in5 = LoadU(d, raw + 5 * stride);
    const V in6 = LoadU(d, raw + 6 * stride);
    const V in7 = LoadU(d, raw + 7 * stride);
    const V in8 = LoadU(d, raw + 8 * stride);
    const V in9 = LoadU(d, raw + 9 * stride);
    const V inA = LoadU(d, raw + 0xA * stride);
    const V inB = LoadU(d, raw + 0xB * stride);
    const V inC = LoadU(d, raw + 0xC * stride);
    const V inD = LoadU(d, raw + 0xD * stride);
    const V inE = LoadU(d, raw + 0xE * stride);
    const V inF = LoadU(d, raw + 0xF * stride);

    // Outputs 0..7: all 12 bits of in0..7, with the low nibble of in8..F
    // placed in the four spare upper bits.
    StoreU(Or(ShiftLeft<12>(in8), in0), d, packed_out + 0 * stride);
    StoreU(Or(ShiftLeft<12>(in9), in1), d, packed_out + 1 * stride);
    StoreU(Or(ShiftLeft<12>(inA), in2), d, packed_out + 2 * stride);
    StoreU(Or(ShiftLeft<12>(inB), in3), d, packed_out + 3 * stride);
    StoreU(Or(ShiftLeft<12>(inC), in4), d, packed_out + 4 * stride);
    StoreU(Or(ShiftLeft<12>(inD), in5), d, packed_out + 5 * stride);
    StoreU(Or(ShiftLeft<12>(inE), in6), d, packed_out + 6 * stride);
    StoreU(Or(ShiftLeft<12>(inF), in7), d, packed_out + 7 * stride);

    // Outputs 8..B: the remaining upper eight bits of in8..F, two inputs per
    // output. The second input is shifted left first so that a single mask
    // suffices and OrAnd can be used.
    const V upper_byte = Set(d, 0xFF00u);
    StoreU(OrAnd(ShiftRight<4>(in8), ShiftLeft<4>(in9), upper_byte), d,
           packed_out + 8 * stride);
    StoreU(OrAnd(ShiftRight<4>(inA), ShiftLeft<4>(inB), upper_byte), d,
           packed_out + 9 * stride);
    StoreU(OrAnd(ShiftRight<4>(inC), ShiftLeft<4>(inD), upper_byte), d,
           packed_out + 0xA * stride);
    StoreU(OrAnd(ShiftRight<4>(inE), ShiftLeft<4>(inF), upper_byte), d,
           packed_out + 0xB * stride);
  }

  // Unpacks 12 vectors read from `packed_in` into 16 vectors of 12-bit lanes
  // written to `raw`; the inverse of Pack.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, packed_in + 0 * stride);
    const V in1 = LoadU(d, packed_in + 1 * stride);
    const V in2 = LoadU(d, packed_in + 2 * stride);
    const V in3 = LoadU(d, packed_in + 3 * stride);
    const V in4 = LoadU(d, packed_in + 4 * stride);
    const V in5 = LoadU(d, packed_in + 5 * stride);
    const V in6 = LoadU(d, packed_in + 6 * stride);
    const V in7 = LoadU(d, packed_in + 7 * stride);
    const V in8 = LoadU(d, packed_in + 8 * stride);
    const V in9 = LoadU(d, packed_in + 9 * stride);
    const V inA = LoadU(d, packed_in + 0xA * stride);
    const V inB = LoadU(d, packed_in + 0xB * stride);

    // raw0..7 are simply the lowest 12 bits of the first eight inputs.
    const V low12 = Set(d, 0xFFFu);
    StoreU(And(in0, low12), d, raw + 0 * stride);
    StoreU(And(in1, low12), d, raw + 1 * stride);
    StoreU(And(in2, low12), d, raw + 2 * stride);
    StoreU(And(in3, low12), d, raw + 3 * stride);
    StoreU(And(in4, low12), d, raw + 4 * stride);
    StoreU(And(in5, low12), d, raw + 5 * stride);
    StoreU(And(in6, low12), d, raw + 6 * stride);
    StoreU(And(in7, low12), d, raw + 7 * stride);

    // raw8..F: low nibble from the top four bits of in0..7, upper eight bits
    // from one byte of in8..B moved into bits 4..11.
    const V bits_4_11 = Set(d, 0xFF0u);
    const V out8 = OrAnd(ShiftRight<12>(in0), ShiftLeft<4>(in8), bits_4_11);
    const V out9 = OrAnd(ShiftRight<12>(in1), ShiftRight<4>(in8), bits_4_11);
    const V outA = OrAnd(ShiftRight<12>(in2), ShiftLeft<4>(in9), bits_4_11);
    const V outB = OrAnd(ShiftRight<12>(in3), ShiftRight<4>(in9), bits_4_11);
    const V outC = OrAnd(ShiftRight<12>(in4), ShiftLeft<4>(inA), bits_4_11);
    const V outD = OrAnd(ShiftRight<12>(in5), ShiftRight<4>(inA), bits_4_11);
    const V outE = OrAnd(ShiftRight<12>(in6), ShiftLeft<4>(inB), bits_4_11);
    const V outF = OrAnd(ShiftRight<12>(in7), ShiftRight<4>(inB), bits_4_11);

    StoreU(out8, d, raw + 8 * stride);
    StoreU(out9, d, raw + 9 * stride);
    StoreU(outA, d, raw + 0xA * stride);
    StoreU(outB, d, raw + 0xB * stride);
    StoreU(outC, d, raw + 0xC * stride);
    StoreU(outD, d, raw + 0xD * stride);
    StoreU(outE, d, raw + 0xE * stride);
    StoreU(outF, d, raw + 0xF * stride);
  }
};
// Pack16<12>
template <>
struct Pack16<13> {
  // Packs 16 vectors of 13-bit lanes read from `raw` into 13 vectors written
  // to `packed_out`. Lanes must not exceed 13 bits: the upper bits of raw0..4
  // are inserted without masking.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // As with 11 bits, it is not obvious what the optimal partitioning looks
    // like. We similarly go with an 8+5 split.
    const VU16 lo8 = Set(d, 0xFFu);

    // Lower 8 bits of all raw
    const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
    const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
    const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
    const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
    const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
    const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
    const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
    const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);

    // Five vectors, three 5bit remnants each, plus one 5bit in their MSB.
    const VU16 top0 = ShiftRight<8>(raw0);
    const VU16 top1 = ShiftRight<8>(raw1);
    const VU16 top2 = ShiftRight<8>(raw2);
    const VU16 top3 = ShiftRight<8>(raw3);
    const VU16 top4 = ShiftRight<8>(raw4);

    // Insert top raw bits into 5-bit groups within packed8..C. Moving the
    // mask along avoids masking each of raw0..E and enables OrAnd.
    VU16 next = Set(d, 0x3E0u);  // 0x1F << 5
    VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next);
    VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next);
    VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next);
    VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next);
    VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next);
    next = ShiftLeft<5>(next);  // = 0x7C00u
    packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next);
    packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next);
    packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next);
    packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next);
    packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next);

    // Scatter upper 5 bits of rawF into the upper bits. The mask must cover
    // only bit 15: the groups are 5 bits wide, so the mask has to move by 5.
    // Moving it by 3 would give 0xE000 and overwrite bits 13..14, which
    // already hold bits of rawA..E.
    next = ShiftLeft<5>(next);  // = 0x8000u (upper bits are shifted out)
    packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
    packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
    packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
    packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next);
    packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next);

    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
    StoreU(packedC, d, packed_out + 0xC * N);
  }

  // Unpacks 13 vectors read from `packed_in` into 16 vectors of 13-bit lanes
  // written to `raw`; the inverse of Pack.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);
    const VU16 packed8 = LoadU(d, packed_in + 8 * N);
    const VU16 packed9 = LoadU(d, packed_in + 9 * N);
    const VU16 packedA = LoadU(d, packed_in + 0xA * N);
    const VU16 packedB = LoadU(d, packed_in + 0xB * N);
    const VU16 packedC = LoadU(d, packed_in + 0xC * N);

    const VU16 mask = Set(d, 0xFFu);  // Lowest 8 bits

    // packed0..7 each hold the low bytes of two raw vectors.
    const VU16 down0 = And(packed0, mask);
    const VU16 down1 = ShiftRight<8>(packed0);
    const VU16 down2 = And(packed1, mask);
    const VU16 down3 = ShiftRight<8>(packed1);
    const VU16 down4 = And(packed2, mask);
    const VU16 down5 = ShiftRight<8>(packed2);
    const VU16 down6 = And(packed3, mask);
    const VU16 down7 = ShiftRight<8>(packed3);
    const VU16 down8 = And(packed4, mask);
    const VU16 down9 = ShiftRight<8>(packed4);
    const VU16 downA = And(packed5, mask);
    const VU16 downB = ShiftRight<8>(packed5);
    const VU16 downC = And(packed6, mask);
    const VU16 downD = ShiftRight<8>(packed6);
    const VU16 downE = And(packed7, mask);
    const VU16 downF = ShiftRight<8>(packed7);

    // Upper five bits from packed8..C, eight bits from down0..F. Within each
    // group of five raw vectors, the k-th one reads from the k-th of
    // packed8..C, mirroring the order used by Pack.
    const VU16 hi5 = Set(d, 0x1F00u);
    const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5);
    const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5);
    const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5);
    const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5);
    const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5);

    const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5);
    const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5);
    const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5);
    const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packedB), hi5);
    const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedC), hi5);

    const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5);
    const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5);
    const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5);
    const VU16 rawD = OrAnd(downD, ShiftRight<2>(packedB), hi5);
    const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedC), hi5);

    // The five upper bits of rawF are the MSBs (bit 15) of packed8..C.
    // Isolate each MSB with ShiftRight<15> before moving it into the top
    // 5-of-13. Shifting first and masking with the 5-bit `hi5` afterwards
    // would also let bits 11..14 (which belong to rawA..E) leak into rawF.
    const VU16 p0 = Xor3(ShiftLeft<8>(ShiftRight<15>(packed8)),
                         ShiftLeft<9>(ShiftRight<15>(packed9)),
                         ShiftLeft<10>(ShiftRight<15>(packedA)));
    const VU16 p1 = Xor3(ShiftLeft<11>(ShiftRight<15>(packedB)),
                         ShiftLeft<12>(ShiftRight<15>(packedC)), downF);
    const VU16 rawF = Or(p0, p1);

    StoreU(raw0, d, raw + 0 * N);
    StoreU(raw1, d, raw + 1 * N);
    StoreU(raw2, d, raw + 2 * N);
    StoreU(raw3, d, raw + 3 * N);
    StoreU(raw4, d, raw + 4 * N);
    StoreU(raw5, d, raw + 5 * N);
    StoreU(raw6, d, raw + 6 * N);
    StoreU(raw7, d, raw + 7 * N);
    StoreU(raw8, d, raw + 8 * N);
    StoreU(raw9, d, raw + 9 * N);
    StoreU(rawA, d, raw + 0xA * N);
    StoreU(rawB, d, raw + 0xB * N);
    StoreU(rawC, d, raw + 0xC * N);
    StoreU(rawD, d, raw + 0xD * N);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
// Pack16<13>
template <>
struct Pack16<14> {
  // Packs 16 vectors of 14-bit lanes read from `raw` into 14 vectors written
  // to `packed_out`. Lanes must not exceed 14 bits: raw0..D are ORed in
  // without masking.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * stride);
    const V in1 = LoadU(d, raw + 1 * stride);
    const V in2 = LoadU(d, raw + 2 * stride);
    const V in3 = LoadU(d, raw + 3 * stride);
    const V in4 = LoadU(d, raw + 4 * stride);
    const V in5 = LoadU(d, raw + 5 * stride);
    const V in6 = LoadU(d, raw + 6 * stride);
    const V in7 = LoadU(d, raw + 7 * stride);
    const V in8 = LoadU(d, raw + 8 * stride);
    const V in9 = LoadU(d, raw + 9 * stride);
    const V inA = LoadU(d, raw + 0xA * stride);
    const V inB = LoadU(d, raw + 0xB * stride);
    const V inC = LoadU(d, raw + 0xC * stride);
    const V inD = LoadU(d, raw + 0xD * stride);
    const V inE = LoadU(d, raw + 0xE * stride);
    const V inF = LoadU(d, raw + 0xF * stride);

    // Every output keeps one input in its low 14 bits and carries two bits of
    // inE (outputs 0..6) or inF (outputs 7..D) in its two spare upper bits.
    // The first output of each group needs no mask because shifting left by
    // 14 leaves only two bits.
    const V top2 = Set(d, 0xC000u);

    StoreU(Or(in0, ShiftLeft<14>(inE)), d, packed_out + 0 * stride);
    StoreU(OrAnd(in1, ShiftLeft<12>(inE), top2), d, packed_out + 1 * stride);
    StoreU(OrAnd(in2, ShiftLeft<10>(inE), top2), d, packed_out + 2 * stride);
    StoreU(OrAnd(in3, ShiftLeft<8>(inE), top2), d, packed_out + 3 * stride);
    StoreU(OrAnd(in4, ShiftLeft<6>(inE), top2), d, packed_out + 4 * stride);
    StoreU(OrAnd(in5, ShiftLeft<4>(inE), top2), d, packed_out + 5 * stride);
    StoreU(OrAnd(in6, ShiftLeft<2>(inE), top2), d, packed_out + 6 * stride);

    StoreU(Or(in7, ShiftLeft<14>(inF)), d, packed_out + 7 * stride);
    StoreU(OrAnd(in8, ShiftLeft<12>(inF), top2), d, packed_out + 8 * stride);
    StoreU(OrAnd(in9, ShiftLeft<10>(inF), top2), d, packed_out + 9 * stride);
    StoreU(OrAnd(inA, ShiftLeft<8>(inF), top2), d, packed_out + 0xA * stride);
    StoreU(OrAnd(inB, ShiftLeft<6>(inF), top2), d, packed_out + 0xB * stride);
    StoreU(OrAnd(inC, ShiftLeft<4>(inF), top2), d, packed_out + 0xC * stride);
    StoreU(OrAnd(inD, ShiftLeft<2>(inF), top2), d, packed_out + 0xD * stride);
  }

  // Unpacks 14 vectors read from `packed_in` into 16 vectors of 14-bit lanes
  // written to `raw`; the inverse of Pack.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, packed_in + 0 * stride);
    const V in1 = LoadU(d, packed_in + 1 * stride);
    const V in2 = LoadU(d, packed_in + 2 * stride);
    const V in3 = LoadU(d, packed_in + 3 * stride);
    const V in4 = LoadU(d, packed_in + 4 * stride);
    const V in5 = LoadU(d, packed_in + 5 * stride);
    const V in6 = LoadU(d, packed_in + 6 * stride);
    const V in7 = LoadU(d, packed_in + 7 * stride);
    const V in8 = LoadU(d, packed_in + 8 * stride);
    const V in9 = LoadU(d, packed_in + 9 * stride);
    const V inA = LoadU(d, packed_in + 0xA * stride);
    const V inB = LoadU(d, packed_in + 0xB * stride);
    const V inC = LoadU(d, packed_in + 0xC * stride);
    const V inD = LoadU(d, packed_in + 0xD * stride);

    // raw0..D are the lowest 14 bits of the corresponding input.
    const V low14 = Set(d, 0x3FFFu);
    StoreU(And(in0, low14), d, raw + 0 * stride);
    StoreU(And(in1, low14), d, raw + 1 * stride);
    StoreU(And(in2, low14), d, raw + 2 * stride);
    StoreU(And(in3, low14), d, raw + 3 * stride);
    StoreU(And(in4, low14), d, raw + 4 * stride);
    StoreU(And(in5, low14), d, raw + 5 * stride);
    StoreU(And(in6, low14), d, raw + 6 * stride);
    StoreU(And(in7, low14), d, raw + 7 * stride);
    StoreU(And(in8, low14), d, raw + 8 * stride);
    StoreU(And(in9, low14), d, raw + 9 * stride);
    StoreU(And(inA, low14), d, raw + 0xA * stride);
    StoreU(And(inB, low14), d, raw + 0xB * stride);
    StoreU(And(inC, low14), d, raw + 0xC * stride);
    StoreU(And(inD, low14), d, raw + 0xD * stride);

    // rawE is assembled from the top two bits of in0..6. AndNot(low14, x)
    // keeps only those two bits; the pieces occupy disjoint bit positions,
    // so Xor3 acts as a three-way OR. A right shift by 14 already discards
    // the low 14 bits, so that piece needs no mask.
    const V e_lo = Xor3(ShiftRight<14>(in0),
                        ShiftRight<12>(AndNot(low14, in1)),
                        ShiftRight<10>(AndNot(low14, in2)));
    const V e_mid = Xor3(ShiftRight<8>(AndNot(low14, in3)),
                         ShiftRight<6>(AndNot(low14, in4)),
                         ShiftRight<4>(AndNot(low14, in5)));
    const V outE = Xor3(ShiftRight<2>(AndNot(low14, in6)), e_lo, e_mid);

    // rawF likewise, from the top two bits of in7..D.
    const V f_lo = Xor3(ShiftRight<14>(in7),
                        ShiftRight<12>(AndNot(low14, in8)),
                        ShiftRight<10>(AndNot(low14, in9)));
    const V f_mid = Xor3(ShiftRight<8>(AndNot(low14, inA)),
                         ShiftRight<6>(AndNot(low14, inB)),
                         ShiftRight<4>(AndNot(low14, inC)));
    const V outF = Xor3(ShiftRight<2>(AndNot(low14, inD)), f_lo, f_mid);

    StoreU(outE, d, raw + 0xE * stride);
    StoreU(outF, d, raw + 0xF * stride);
  }
};
// Pack16<14>
template <>
struct Pack16<15> {
  // Packs 16 vectors of 15-bit lanes read from `raw` into 15 vectors written
  // to `packed_out`. Lanes must not exceed 15 bits: raw0..E are ORed in
  // without masking.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, raw + 0 * stride);
    const V in1 = LoadU(d, raw + 1 * stride);
    const V in2 = LoadU(d, raw + 2 * stride);
    const V in3 = LoadU(d, raw + 3 * stride);
    const V in4 = LoadU(d, raw + 4 * stride);
    const V in5 = LoadU(d, raw + 5 * stride);
    const V in6 = LoadU(d, raw + 6 * stride);
    const V in7 = LoadU(d, raw + 7 * stride);
    const V in8 = LoadU(d, raw + 8 * stride);
    const V in9 = LoadU(d, raw + 9 * stride);
    const V inA = LoadU(d, raw + 0xA * stride);
    const V inB = LoadU(d, raw + 0xB * stride);
    const V inC = LoadU(d, raw + 0xC * stride);
    const V inD = LoadU(d, raw + 0xD * stride);
    const V inE = LoadU(d, raw + 0xE * stride);
    const V inF = LoadU(d, raw + 0xF * stride);

    // Output k keeps input k in its low 15 bits and carries bit k of inF in
    // its spare MSB. Output 0 needs no mask because shifting left by 15
    // leaves a single bit.
    const V msb = Set(d, 0x8000u);

    StoreU(Or(in0, ShiftLeft<15>(inF)), d, packed_out + 0 * stride);
    StoreU(OrAnd(in1, ShiftLeft<14>(inF), msb), d, packed_out + 1 * stride);
    StoreU(OrAnd(in2, ShiftLeft<13>(inF), msb), d, packed_out + 2 * stride);
    StoreU(OrAnd(in3, ShiftLeft<12>(inF), msb), d, packed_out + 3 * stride);
    StoreU(OrAnd(in4, ShiftLeft<11>(inF), msb), d, packed_out + 4 * stride);
    StoreU(OrAnd(in5, ShiftLeft<10>(inF), msb), d, packed_out + 5 * stride);
    StoreU(OrAnd(in6, ShiftLeft<9>(inF), msb), d, packed_out + 6 * stride);
    StoreU(OrAnd(in7, ShiftLeft<8>(inF), msb), d, packed_out + 7 * stride);
    StoreU(OrAnd(in8, ShiftLeft<7>(inF), msb), d, packed_out + 8 * stride);
    StoreU(OrAnd(in9, ShiftLeft<6>(inF), msb), d, packed_out + 9 * stride);
    StoreU(OrAnd(inA, ShiftLeft<5>(inF), msb), d, packed_out + 0xA * stride);
    StoreU(OrAnd(inB, ShiftLeft<4>(inF), msb), d, packed_out + 0xB * stride);
    StoreU(OrAnd(inC, ShiftLeft<3>(inF), msb), d, packed_out + 0xC * stride);
    StoreU(OrAnd(inD, ShiftLeft<2>(inF), msb), d, packed_out + 0xD * stride);
    StoreU(OrAnd(inE, ShiftLeft<1>(inF), msb), d, packed_out + 0xE * stride);
  }

  // Unpacks 15 vectors read from `packed_in` into 16 vectors of 15-bit lanes
  // written to `raw`; the inverse of Pack.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using V = Vec<D>;
    const size_t stride = Lanes(d);

    const V in0 = LoadU(d, packed_in + 0 * stride);
    const V in1 = LoadU(d, packed_in + 1 * stride);
    const V in2 = LoadU(d, packed_in + 2 * stride);
    const V in3 = LoadU(d, packed_in + 3 * stride);
    const V in4 = LoadU(d, packed_in + 4 * stride);
    const V in5 = LoadU(d, packed_in + 5 * stride);
    const V in6 = LoadU(d, packed_in + 6 * stride);
    const V in7 = LoadU(d, packed_in + 7 * stride);
    const V in8 = LoadU(d, packed_in + 8 * stride);
    const V in9 = LoadU(d, packed_in + 9 * stride);
    const V inA = LoadU(d, packed_in + 0xA * stride);
    const V inB = LoadU(d, packed_in + 0xB * stride);
    const V inC = LoadU(d, packed_in + 0xC * stride);
    const V inD = LoadU(d, packed_in + 0xD * stride);
    const V inE = LoadU(d, packed_in + 0xE * stride);

    // raw0..E are the lowest 15 bits of the corresponding input.
    const V low15 = Set(d, 0x7FFFu);
    StoreU(And(in0, low15), d, raw + 0 * stride);
    StoreU(And(in1, low15), d, raw + 1 * stride);
    StoreU(And(in2, low15), d, raw + 2 * stride);
    StoreU(And(in3, low15), d, raw + 3 * stride);
    StoreU(And(in4, low15), d, raw + 4 * stride);
    StoreU(And(in5, low15), d, raw + 5 * stride);
    StoreU(And(in6, low15), d, raw + 6 * stride);
    StoreU(And(in7, low15), d, raw + 7 * stride);
    StoreU(And(in8, low15), d, raw + 8 * stride);
    StoreU(And(in9, low15), d, raw + 9 * stride);
    StoreU(And(inA, low15), d, raw + 0xA * stride);
    StoreU(And(inB, low15), d, raw + 0xB * stride);
    StoreU(And(inC, low15), d, raw + 0xC * stride);
    StoreU(And(inD, low15), d, raw + 0xD * stride);
    StoreU(And(inE, low15), d, raw + 0xE * stride);

    // rawF is assembled from the MSB of in0..E: AndNot(low15, x) keeps only
    // the MSB, which is then moved down to bit k for input k. The pieces
    // occupy disjoint bit positions, so Xor3 acts as a three-way OR.
    const V bits_0_2 = Xor3(ShiftRight<15>(in0),
                            ShiftRight<14>(AndNot(low15, in1)),
                            ShiftRight<13>(AndNot(low15, in2)));
    const V bits_3_5 = Xor3(ShiftRight<12>(AndNot(low15, in3)),
                            ShiftRight<11>(AndNot(low15, in4)),
                            ShiftRight<10>(AndNot(low15, in5)));
    const V bits_6_8 = Xor3(ShiftRight<9>(AndNot(low15, in6)),
                            ShiftRight<8>(AndNot(low15, in7)),
                            ShiftRight<7>(AndNot(low15, in8)));
    const V bits_9_11 = Xor3(ShiftRight<6>(AndNot(low15, in9)),
                             ShiftRight<5>(AndNot(low15, inA)),
                             ShiftRight<4>(AndNot(low15, inB)));
    const V bits_12_14 = Xor3(ShiftRight<3>(AndNot(low15, inC)),
                              ShiftRight<2>(AndNot(low15, inD)),
                              ShiftRight<1>(AndNot(low15, inE)));
    const V outF =
        Xor3(bits_0_2, bits_3_5, Xor3(bits_6_8, bits_9_11, bits_12_14));
    StoreU(outF, d, raw + 0xF * stride);
  }
};
// Pack16<15>
template <>
struct Pack16<16> {
  // With 16 bits per 16-bit lane there is nothing to compress: copies 16
  // vectors from `raw` to `packed_out` unchanged.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    const size_t stride = Lanes(d);
    // `raw` and `packed_out` are declared non-aliasing, so copying one vector
    // at a time gives the same result as loading all 16 before storing.
    for (size_t i = 0; i < 16; ++i) {
      StoreU(LoadU(d, raw + i * stride), d, packed_out + i * stride);
    }
  }

  // Copies 16 vectors from `packed_in` to `raw` unchanged; the inverse of
  // Pack.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    const size_t stride = Lanes(d);
    for (size_t i = 0; i < 16; ++i) {
      StoreU(LoadU(d, packed_in + i * stride), d, raw + i * stride);
    }
  }
};
// Pack16<16>
// NOLINTNEXTLINE(google-readability-namespace-comments)
}
// namespace HWY_NAMESPACE
}
// namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_