/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses.
*/
/* zstd_decompress_block :
* this module takes care of decompressing _compressed_ block */
/* These two optional macros force the use one way or another of the two * ZSTD_decompressSequences implementations. You can't force in both directions * at the same time.
*/ #ifdefined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) #error"Cannot force the use of the short and the long ZSTD_decompressSequences variants!" #endif
/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ staticvoid ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, const streaming_operation streaming, const size_t expectedWriteSize, constunsigned splitImmediately)
{
size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
assert(litSize <= blockSizeMax);
assert(dctx->isFrameDecompression || streaming == not_streaming);
assert(expectedWriteSize <= blockSizeMax); if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { /* If we aren't streaming, we can just put the literals after the output * of the current block. We don't need to worry about overwriting the * extDict of our window, because it doesn't exist. * So if we have space after the end of the block, just put it there.
*/
dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
dctx->litBufferEnd = dctx->litBuffer + litSize;
dctx->litBufferLocation = ZSTD_in_dst;
} elseif (litSize <= ZSTD_LITBUFFEREXTRASIZE) { /* Literals fit entirely within the extra buffer, put them there to avoid * having to split the literals.
*/
dctx->litBuffer = dctx->litExtraBuffer;
dctx->litBufferEnd = dctx->litBuffer + litSize;
dctx->litBufferLocation = ZSTD_not_in_dst;
} else {
assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); /* Literals must be split between the output block and the extra lit * buffer. We fill the extra lit buffer with the tail of the literals, * and put the rest of the literals at the end of the block, with * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. * This MUST not write more than our maxBlockSize beyond dst, because in * streaming mode, that could overwrite part of our extDict window.
*/ if (splitImmediately) { /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
} else { /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
}
dctx->litBufferLocation = ZSTD_split;
assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
}
}
/*! ZSTD_decodeLiteralsBlock() : * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. * * @return : nb of bytes read from src (< srcSize )
* note : symbol not declared but exposed for fullbench */ static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, constvoid* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ void* dst, size_t dstCapacity, const streaming_operation streaming)
{
DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
/* Default FSE distribution tables. * These are pre-calculated FSE decoding tables using default distributions as defined in specification : * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions * They were generated programmatically with following method : * - start from default distributions, present in /lib/common/zstd_internal.h * - generate tables normally, using ZSTD_buildFSETable() * - printout the content of tables
* - pretify output, report below, test with fuzzer to ensure it's correct */
/* Spread symbols */
assert(tableSize <= 512); /* Specialized symbol spreading for the case when there are * no low probability (-1 count) symbols. When compressing * small blocks we avoid low probability symbols to hit this * case, since header decoding speed matters more.
*/ if (highThreshold == tableSize - 1) {
size_t const tableMask = tableSize-1;
size_t const step = FSE_TABLESTEP(tableSize); /* First lay down the symbols in order. * We use a uint64_t to lay down 8 bytes at a time. This reduces branch * misses since small blocks generally have small table logs, so nearly * all symbols have counts <= 8. We ensure we have 8 bytes at the end of * our buffer to handle the over-write.
*/
{
U64 const add = 0x0101010101010101ull;
size_t pos = 0;
U64 sv = 0;
U32 s; for (s=0; s<maxSV1; ++s, sv += add) { int i; intconst n = normalizedCounter[s];
MEM_write64(spread + pos, sv); for (i = 8; i < n; i += 8) {
MEM_write64(spread + pos + i, sv);
}
assert(n>=0);
pos += (size_t)n;
}
} /* Now we spread those positions across the table. * The benefit of doing it in two stages is that we avoid the * variable size inner loop, which caused lots of branch misses. * Now we can run through all the positions without any branch misses. * We unroll the loop twice, since that is what empirically worked best.
*/
{
size_t position = 0;
size_t s;
size_t const unroll = 2;
assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ for (s = 0; s < (size_t)tableSize; s += unroll) {
size_t u; for (u = 0; u < unroll; ++u) {
size_t const uPosition = (position + (u * step)) & tableMask;
tableDecode[uPosition].baseValue = spread[s + u];
}
position = (position + (unroll * step)) & tableMask;
}
assert(position == 0);
}
} else {
U32 const tableMask = tableSize-1;
U32 const step = FSE_TABLESTEP(tableSize);
U32 s, position = 0; for (s=0; s<maxSV1; s++) { int i; intconst n = normalizedCounter[s]; for (i=0; i<n; i++) {
tableDecode[position].baseValue = s;
position = (position + step) & tableMask; while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */
} }
assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
}
/*! ZSTD_overlapCopy8() : * Copies 8 bytes from ip to op and updates op and ip where ip <= op. * If the offset is < 8 then the offset is spread to at least 8 bytes. * * Precondition: *ip <= *op * Postcondition: *op - *op >= 8
*/
HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
assert(*ip <= *op); if (offset < 8) { /* close range match, overlap */ staticconst U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ staticconstint dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ intconst sub2 = dec64table[offset];
(*op)[0] = (*ip)[0];
(*op)[1] = (*ip)[1];
(*op)[2] = (*ip)[2];
(*op)[3] = (*ip)[3];
*ip += dec32table[offset];
ZSTD_copy4(*op+4, *ip);
*ip -= sub2;
} else {
ZSTD_copy8(*op, *ip);
}
*ip += 8;
*op += 8;
assert(*op - *ip >= 8);
}
/*! ZSTD_safecopy() : * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer * and write up to 16 bytes past oend_w (op >= oend_w is allowed). * This function is only called in the uncommon case where the sequence is near the end of the block. It * should be fast for a single long sequence, but can be slow for several short sequences. * * @param ovtype controls the overlap detection * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. * The src buffer must be before the dst buffer.
*/ staticvoid ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
ptrdiff_t const diff = op - ip;
BYTE* const oend = op + length;
if (length < 8) { /* Handle short lengths. */ while (op < oend) *op++ = *ip++; return;
} if (ovtype == ZSTD_overlap_src_before_dst) { /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
assert(length >= 8);
ZSTD_overlapCopy8(&op, &ip, diff);
length -= 8;
assert(op - ip >= 8);
assert(op <= oend);
}
if (oend <= oend_w) { /* No risk of overwrite. */
ZSTD_wildcopy(op, ip, length, ovtype); return;
} if (op <= oend_w) { /* Wildcopy until we get close to the end. */
assert(oend > oend_w);
ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
ip += oend_w - op;
op += oend_w - op;
} /* Handle the leftovers. */ while (op < oend) *op++ = *ip++;
}
/* ZSTD_safecopyDstBeforeSrc(): * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
* Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ staticvoid ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
ptrdiff_t const diff = op - ip;
BYTE* const oend = op + length;
if (length < 8 || diff > -8) { /* Handle short lengths, close overlaps, and dst not before src. */ while (op < oend) *op++ = *ip++; return;
}
/* Handle the leftovers. */ while (op < oend) *op++ = *ip++;
}
/* ZSTD_execSequenceEnd(): * This version handles cases that are near the end of the output buffer. It requires * more careful checks to make sure there is no overflow. By separating out these hard * and unlikely cases, we can speed up the common cases. * * NOTE: This function needs to be fast for a single long sequence, but doesn't need * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
*/
FORCE_NOINLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceEnd(BYTE* op,
BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
BYTE* const oLitEnd = op + sequence.litLength;
size_t const sequenceLength = sequence.litLength + sequence.matchLength; const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset;
BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
/* bounds checks : careful of address space overflow in 32-bit mode */
RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
assert(op < op + sequenceLength);
assert(oLitEnd < op + sequenceLength);
/* ZSTD_execSequenceEndSplitLitBuffer(): * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
*/
FORCE_NOINLINE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
BYTE* const oend, const BYTE* const oend_w, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
{
BYTE* const oLitEnd = op + sequence.litLength;
size_t const sequenceLength = sequence.litLength + sequence.matchLength; const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset;
/* bounds checks : careful of address space overflow in 32-bit mode */
RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
assert(op < op + sequenceLength);
assert(oLitEnd < op + sequenceLength);
/* copy literals */
RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer");
ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength);
op = oLitEnd;
*litPtr = iLitEnd;
#ifdefined(__aarch64__) /* prefetch sequence starting from match that will be used for copy later */
PREFETCH_L1(match); #endif /* Handle edge cases in a slow path: * - Read beyond end of literals * - Match end is within WILDCOPY_OVERLIMIT of oend * - 32-bit mode and the match length overflows
*/ if (UNLIKELY(
iLitEnd > litLimit ||
oMatchEnd > oend_w ||
(MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
assert(op <= oLitEnd /* No overflow */);
assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
assert(oMatchEnd <= oend /* No underflow */);
assert(iLitEnd <= litLimit /* Literal length is in bounds */);
assert(oLitEnd <= oend_w /* Can wildcopy literals */);
assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
/* Copy Literals: * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. * We likely don't need the full 32-byte wildcopy.
*/
assert(WILDCOPY_OVERLENGTH >= 16);
ZSTD_copy16(op, (*litPtr)); if (UNLIKELY(sequence.litLength > 16)) {
ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap);
}
op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */
/* Copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */
RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) {
ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength;
} /* span extDict & currentPrefixSegment */
{ size_t const length1 = dictEnd - match;
ZSTD_memmove(oLitEnd, match, length1);
op = oLitEnd + length1;
sequence.matchLength -= length1;
match = prefixStart;
}
} /* Match within prefix of 1 or more bytes */
assert(op <= oMatchEnd);
assert(oMatchEnd <= oend_w);
assert(match >= prefixStart);
assert(sequence.matchLength >= 1);
/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy * without overlap checking.
*/ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { /* We bet on a full wildcopy for matches, since we expect matches to be * longer than literals (in general). In silesia, ~10% of matches are longer * than 16 bytes.
*/
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); return sequenceLength;
}
assert(sequence.offset < WILDCOPY_VECLEN);
/* Copy 8 bytes and spread the offset to be >= 8. */
ZSTD_overlapCopy8(&op, &match, sequence.offset);
/* If the match length is > 8 bytes, then continue with the wildcopy. */ if (sequence.matchLength > 8) {
assert(op < oMatchEnd);
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst);
} return sequenceLength;
}
assert(op != NULL /* Precondition */);
assert(oend_w < oend /* No underflow */); /* Handle edge cases in a slow path: * - Read beyond end of literals * - Match end is within WILDCOPY_OVERLIMIT of oend * - 32-bit mode and the match length overflows
*/ if (UNLIKELY(
iLitEnd > litLimit ||
oMatchEnd > oend_w ||
(MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
assert(op <= oLitEnd /* No overflow */);
assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
assert(oMatchEnd <= oend /* No underflow */);
assert(iLitEnd <= litLimit /* Literal length is in bounds */);
assert(oLitEnd <= oend_w /* Can wildcopy literals */);
assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
/* Copy Literals: * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. * We likely don't need the full 32-byte wildcopy.
*/
assert(WILDCOPY_OVERLENGTH >= 16);
ZSTD_copy16(op, (*litPtr)); if (UNLIKELY(sequence.litLength > 16)) {
ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
}
op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */
/* Copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */
RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) {
ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength;
} /* span extDict & currentPrefixSegment */
{ size_t const length1 = dictEnd - match;
ZSTD_memmove(oLitEnd, match, length1);
op = oLitEnd + length1;
sequence.matchLength -= length1;
match = prefixStart;
} } /* Match within prefix of 1 or more bytes */
assert(op <= oMatchEnd);
assert(oMatchEnd <= oend_w);
assert(match >= prefixStart);
assert(sequence.matchLength >= 1);
/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy * without overlap checking.
*/ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { /* We bet on a full wildcopy for matches, since we expect matches to be * longer than literals (in general). In silesia, ~10% of matches are longer * than 16 bytes.
*/
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); return sequenceLength;
}
assert(sequence.offset < WILDCOPY_VECLEN);
/* Copy 8 bytes and spread the offset to be >= 8. */
ZSTD_overlapCopy8(&op, &match, sequence.offset);
/* If the match length is > 8 bytes, then continue with the wildcopy. */ if (sequence.matchLength > 8) {
assert(op < oMatchEnd);
ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
} return sequenceLength;
}
/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 * bits before reloading. This value is the maximum number of bytes we read * after reloading when we are decoding long offsets.
*/ #define LONG_OFFSETS_MAX_EXTRA_BITS_32 \
(ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \
? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \
: 0)
assert(llBits <= MaxLLBits);
assert(mlBits <= MaxMLBits);
assert(ofBits <= MaxOff); /* * As gcc has better branch and block analyzers, sometimes it is only * valuable to mark likeliness for clang, it gives around 3-4% of * performance.
*/
if (mlBits > 0)
seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
BIT_reloadDStream(&seqState->DStream); if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
BIT_reloadDStream(&seqState->DStream); /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
if (llBits > 0)
seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
if (MEM_32bits())
BIT_reloadDStream(&seqState->DStream);
/* decompress without overrunning litPtr begins */
{ seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ /* Align the decompression loop to 32 + 16 bytes. * * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression * speed swings based on the alignment of the decompression loop. This * performance swing is caused by parts of the decompression loop falling * out of the DSB. The entire decompression loop should fit in the DSB, * when it can't we get much worse performance. You can measure if you've * hit the good case or the bad case with this perf command for some * compressed file test.zst: * * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst * * If you see most cycles served out of the MITE you've hit the bad case. * If you see most cycles served out of the DSB you've hit the good case. * If it is pretty even then you may be in an okay case. * * This issue has been reproduced on the following CPUs: * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 * Use Instruments->Counters to get DSB/MITE cycles. * I never got performance swings, but I was able to * go from the good case of mostly DSB to half of the * cycles served from MITE. * - Coffeelake: Intel i9-9900k * - Coffeelake: Intel i7-9700k * * I haven't been able to reproduce the instability or DSB misses on any * of the following CPUS: * - Haswell * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH * - Skylake * * Alignment is done for each of the three major decompression loops: * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer * - ZSTD_decompressSequences_body * Alignment choices are made to minimize large swings on bad cases and influence on performance * from changes external to this code, rather than to overoptimize on the current commit. * * If you are seeing performance stability this script can help test. * It tests on 4 commits in zstd where I saw performance change. * * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
*/ #ifdefined(__GNUC__) && defined(__x86_64__)
__asm__(".p2align 6"); # if __GNUC__ >= 7 /* good for gcc-7, gcc-9, and gcc-11 */
__asm__("nop");
__asm__(".p2align 5");
__asm__("nop");
__asm__(".p2align 4"); # if __GNUC__ == 8 || __GNUC__ == 10 /* good for gcc-8 and gcc-10 */
__asm__("nop");
__asm__(".p2align 3"); # endif # endif #endif
/* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ for ( ; nbSeq; nbSeq--) {
sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); if (litPtr + sequence.litLength > dctx->litBufferEnd) break;
{ size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); #ifdefined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
assert(!ZSTD_isError(oneSeqSize));
ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize;
DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
op += oneSeqSize;
} }
DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)");
/* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ if (nbSeq > 0) { const size_t leftoverLit = dctx->litBufferEnd - litPtr;
DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); if (leftoverLit) {
RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
sequence.litLength -= leftoverLit;
op += leftoverLit;
}
litPtr = dctx->litExtraBuffer;
litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
dctx->litBufferLocation = ZSTD_not_in_dst;
{ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); #ifdefined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
assert(!ZSTD_isError(oneSeqSize));
ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize;
DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
op += oneSeqSize;
}
nbSeq--;
}
}
if (nbSeq > 0) { /* there is remaining lit from extra buffer */
#ifdefined(__GNUC__) && defined(__x86_64__)
__asm__(".p2align 6");
__asm__("nop"); # if __GNUC__ != 7 /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */
__asm__(".p2align 4");
__asm__("nop");
__asm__(".p2align 3"); # elif __GNUC__ >= 11
__asm__(".p2align 3"); # else
__asm__(".p2align 5");
__asm__("nop");
__asm__(".p2align 3"); # endif #endif
size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, const BYTE* const prefixStart, const BYTE* const dictEnd)
{
prefetchPos += sequence.litLength;
{ const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
* No consequence though : memory address is only used for prefetching, not for dereferencing */ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset);
PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
} return prefetchPos + sequence.matchLength;
}
/* This decoding function employs prefetching * to reduce latency impact of cache misses. * It's generally employed when block contains a significant portion of long-distance matches
* or when coupled with a "cold" dictionary */
FORCE_INLINE_TEMPLATE size_t
ZSTD_decompressSequencesLong_body(
ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, constvoid* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{ const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize;
BYTE* const ostart = (BYTE*)dst;
BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
/* Regen sequences */ if (nbSeq) { #define STORED_SEQS 8 #define STORED_SEQS_MASK (STORED_SEQS-1) #define ADVANCED_SEQS STORED_SEQS
seq_t sequences[STORED_SEQS]; intconst seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
seqState_t seqState; int seqNb;
size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT /* ZSTD_decompressSequencesLong() : * decompression function triggered when a minimum share of offsets is considered "long", * aka out of cache. * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
* This function will try to mitigate main memory latency through the use of prefetching */ static size_t
ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, constvoid* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset)
{
DEBUGLOG(5, "ZSTD_decompressSequencesLong"); #if DYNAMIC_BMI2 if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
} #endif return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
} #endif/* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
/** * @returns The total size of the history referenceable by zstd, including * both the prefix and the extDict. At @p op any offset larger than this * is invalid.
*/ static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
{ return (size_t)(op - virtualStart);
}
/* ZSTD_getOffsetInfo() : * condition : offTable must be valid * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) * compared to maximum possible of (1<<OffFSELog), * as well as the maximum number additional bits required.
*/ static ZSTD_OffsetInfo
ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
{
ZSTD_OffsetInfo info = {0, 0}; /* If nbSeq == 0, then the offTable is uninitialized, but we have * no sequences, so both values should be 0.
*/ if (nbSeq != 0) { constvoid* ptr = offTable;
U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; const ZSTD_seqSymbol* table = offTable + 1;
U32 const max = 1 << tableLog;
U32 u;
DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
assert(max <= (1 << OffFSELog)); /* max not too large */ for (u=0; u<max; u++) {
info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits); if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
}
/** * @returns The maximum offset we can decode in one read of our bitstream, without * reloading more bits in the middle of the offset bits read. Any offsets larger * than this must use the long offset decoder.
*/ static size_t ZSTD_maxShortOffset(void)
{ if (MEM_64bits()) { /* We can decode any offset without reloading bits. * This might change if the max window size grows.
*/
ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); return (size_t)-1;
} else { /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
*/
size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); return maxOffset;
}
}
/* Note : the wording of the specification * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). * This generally does not happen, as it makes little sense, * since an uncompressed block would feature same size and have no decompression cost. * Also, note that decoder from reference libzstd before < v1.5.4 * would consider this edge case as an error. * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
* for broader compatibility with the deployed ecosystem of zstd decoders */
RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
/* Build Decoding Tables */
{ /* Compute the maximum block size, which must also work when !frame and fParams are unset. * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
*/
size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); /* isLongOffset must be true if there are long offsets. * Offsets are long if they are larger than ZSTD_maxShortOffset(). * We don't expect that to be the case in 64-bit mode. * * We check here to see if our history is large enough to allow long offsets. * If it isn't, then we can't possible have (valid) long offsets. If the offset * is invalid, then it is okay to read it incorrectly. * * If isLongOffsets is true, then we will later check our decoding table to see * if it is even possible to generate long offsets.
*/
ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); /* These macros control at build-time which decompressor implementation * we use. If neither is defined, we do some inspection and dispatch at * runtime.
*/ #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
!defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) int usePrefetchDecoder = dctx->ddictIsCold; #else /* Set to 1 to avoid computing offset info if we don't need to. * Otherwise this value is ignored.
*/ int usePrefetchDecoder = 1; #endif int nbSeq;
size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); if (ZSTD_isError(seqHSize)) return seqHSize;
ip += seqHSize;
srcSize -= seqHSize;
/* If we could potentially have long offsets, or we might want to use the prefetch decoder, * compute information about the share of long offsets, and the maximum nbAdditionalBits. * NOTE: could probably use a larger nbSeq limit
*/ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { /* If isLongOffset, but the maximum number of additional bits that we see in our table is small * enough, then we know it is impossible to have too long an offset in this block, so we can * use the regular offset decoder.
*/
isLongOffset = ZSTD_lo_isRegularOffset;
} if (!usePrefetchDecoder) {
U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
usePrefetchDecoder = (info.longOffsetShare >= minShare);
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.