diff options
Diffstat (limited to 'tools/src/minilzlib/lzmadec.c')
| -rw-r--r-- | tools/src/minilzlib/lzmadec.c | 627 |
1 files changed, 627 insertions, 0 deletions
diff --git a/tools/src/minilzlib/lzmadec.c b/tools/src/minilzlib/lzmadec.c new file mode 100644 index 0000000..1a3c420 --- /dev/null +++ b/tools/src/minilzlib/lzmadec.c @@ -0,0 +1,627 @@ +/*++ + +Copyright (c) Alex Ionescu. All rights reserved. + +Module Name: + + lzmadec.c + +Abstract: + + This module implements the LZMA Decoding Logic responsible for decoding the + three possible types of LZMA "packets": matches, repetitions (short \& long) + and literals. The probability model for each type of packet is also stored + in this file, along with the management of the previously seen packet types + (which is tracked as the "sequence"). + +Author: + + Alex Ionescu (@aionescu) 15-Apr-2020 - Initial version + +Environment: + + Windows & Linux, user mode and kernel mode. + +--*/ + +#include "minlzlib.h" +#include "lzmadec.h" + +// +// Probability Bit Model for Lenghts in Rep and in Match sequences +// +typedef struct _LENGTH_DECODER_STATE +{ + // + // Bit Model for the choosing the type of length encoding + // + uint16_t Choice; + uint16_t Choice2; + // + // Bit Model for each of the length encodings + // + uint16_t Low[LZMA_POSITION_COUNT][LZMA_MAX_LOW_LENGTH]; + uint16_t Mid[LZMA_POSITION_COUNT][LZMA_MAX_MID_LENGTH]; + uint16_t High[LZMA_MAX_HIGH_LENGTH]; +} LENGTH_DECODER_STATE, * PLENGTH_DECODER_STATE; + +// +// State used for LZMA decoding +// +typedef struct _DECODER_STATE +{ + // + // Current type of sequence last decoded + // + LZMA_SEQUENCE_STATE Sequence; + // + // History of last 4 decoded distances + // + uint32_t Rep0; + uint32_t Rep1; + uint32_t Rep2; + uint32_t Rep3; + // + // Pending length to repeat from dictionary + // + uint32_t Len; + // + // Probability Bit Models for all sequence types + // + union + { + struct + { + // + // Literal model + // + uint16_t Literal[LZMA_LITERAL_CODERS][LZMA_LC_MODEL_SIZE]; + // + // Last-used-distance based models + // + uint16_t Rep[LzmaMaxState]; + uint16_t Rep0[LzmaMaxState]; + uint16_t Rep0Long[LzmaMaxState][LZMA_POSITION_COUNT]; + uint16_t Rep1[LzmaMaxState]; + uint16_t Rep2[LzmaMaxState]; + LENGTH_DECODER_STATE RepLen; + // + // Explicit distance match based models + // + uint16_t Match[LzmaMaxState][LZMA_POSITION_COUNT]; + uint16_t DistSlot[LZMA_FIRST_CONTEXT_DISTANCE_SLOT][LZMA_DISTANCE_SLOTS]; + uint16_t Dist[(1 << 7) - LZMA_FIRST_FIXED_DISTANCE_SLOT]; + uint16_t Align[LZMA_DISTANCE_ALIGN_SLOTS]; + LENGTH_DECODER_STATE MatchLen; + } BitModel; + uint16_t RawProbabilities[LZMA_BIT_MODEL_SLOTS]; + } u; +} DECODER_STATE, *PDECODER_STATE; +DECODER_STATE Decoder; + +// +// LZMA decoding uses 3 "properties" which determine how the probability +// bit model will be laid out. These store the number of bits that are used +// to pick the correct Literal Coder ("lc"), the number of Position bits to +// select the Literal coder ("lp"), and the number of Position Bits used to +// select various lengths ("pb"). In LZMA2, these properties are encoded in +// a single byte with the formula: ((pb * 45) + lp * 9) + lc). +// +// We only support the default {lc = 3, lp = 0, pb = 2} properties, which +// are what the main encoders out there use. This means that a total of 2 +// bits will be used for arithmetic-coded bit trees that are dependent on +// the current position, and that a total of 3 bits will be used when we +// pick the arithmetic-coded bit tree used for literal coding. The 0 means +// this selection will _not_ be dependent on the position in the buffer. +// +const uint8_t k_LzSupportedProperties = + (LZMA_PB * 45) + (LZMA_LP * 9) + (LZMA_LC); + +void +LzSetLiteral ( + PLZMA_SEQUENCE_STATE State + ) +{ + if (*State <= LzmaLitShortrepLitLitState) + { + // + // States 0-3 represent packets with at least 2 back-to-back literals, + // so another literal now takes us to state 0 (3 back-to-back literals) + // + *State = LzmaLitLitLitState; + } + else if (*State <= LzmaLitShortrepState) + { + // + // States 4-6 represent packets with a literal at the end, so seeing + // another literal now takes us to 2 back-to-back literals, which are + // state packets 1-3. + // + // States 7-9 represent packets with a literal at the start, followed + // by a match/rep/shortrep. Seeing another literal now drops this first + // literal and takes us to having a literal at the end, which are state + // packets 4-6 that we just described in the paragraph above. + // + *State = (LZMA_SEQUENCE_STATE)(*State - 3); + } + else + { + // + // Finally, state 10 and 11 represent cases without a single literal in + // the last 2 sequence packets, so seeing a literal now takes us to a + // "literal at the end" state, either following a match or a rep. + // + *State = (LZMA_SEQUENCE_STATE)(*State - 6); + } +} + +bool +LzIsLiteral ( + LZMA_SEQUENCE_STATE State + ) +{ + // + // States 0-6 describe literal packet sequences + // + return State < LzmaMaxLitState; +} + +void +LzSetMatch ( + PLZMA_SEQUENCE_STATE State + ) +{ + // + // Move to the appropriate "match" state based on current literal state + // + *State = LzIsLiteral(*State) ? LzmaLitMatchState : LzmaNonlitMatchState; +} + +void +LzSetLongRep ( + PLZMA_SEQUENCE_STATE State + ) +{ + // + // Move to the appropriate "long rep" state based on current literal state + // + *State = LzIsLiteral(*State) ? LzmaLitRepState : LzmaNonlitRepState; +} + +void +LzSetShortRep ( + PLZMA_SEQUENCE_STATE State + ) +{ + // + // Move to the appropriate "short rep" state based on current literal state + // + *State = LzIsLiteral(*State) ? LzmaLitShortrepState : LzmaNonlitRepState; +} + +uint16_t* +LzGetLiteralSlot ( + void + ) +{ + uint8_t symbol; + + // + // To pick the correct literal coder arithmetic-coded bit tree, LZMA uses + // the "lc" parameter to choose the number of high bits from the previous + // symbol (in the normal case, 3). It then combines that with the "lp" + // parameter to choose the number of low bits from the current position in + // the dictionary. However, since "lp" is normally 0, we can omit this. + // + symbol = DtGetSymbol(1); + return Decoder.u.BitModel.Literal[symbol >> (8 - LZMA_LC)]; +} + +uint16_t* +LzGetDistSlot ( + void + ) +{ + uint8_t slotIndex; + + // + // There are 4 different arithmetic-coded bit trees which are used to pick + // the correct "distance slot" when doing match distance decoding. Each of + // them is used based on the length of the symbol that is being repeated. + // For lengths of 2, 3, 4 bytes, a dedicated set of distance slots is used. + // For lengths of 5 bytes or above, a shared set of distance slots is used. + // + if (Decoder.Len < (LZMA_FIRST_CONTEXT_DISTANCE_SLOT + LZMA_MIN_LENGTH)) + { + slotIndex = (uint8_t)(Decoder.Len - LZMA_MIN_LENGTH); + } + else + { + slotIndex = LZMA_FIRST_CONTEXT_DISTANCE_SLOT - 1; + } + return Decoder.u.BitModel.DistSlot[slotIndex]; +} + +void +LzDecodeLiteral ( + void + ) +{ + uint16_t* probArray; + uint8_t symbol, matchByte; + + // + // First, choose the correct arithmetic-coded bit tree (which is based on + // the last symbol we just decoded), then see if we last decoded a literal. + // + // If so, simply get the symbol from the bit tree as normal. However, if + // we didn't last see a literal, we need to read the "match byte" that is + // "n" bytes away from the last decoded match. We previously stored this in + // rep0. + // + // Based on this match byte, we'll then use 2 other potential bit trees, + // see LzDecodeMatched for more information. + // + probArray = LzGetLiteralSlot(); + if (LzIsLiteral(Decoder.Sequence)) + { + + symbol = RcGetBitTree(probArray, (1 << 8)); + } + else + { + matchByte = DtGetSymbol(Decoder.Rep0 + 1); + symbol = RcDecodeMatchedBitTree(probArray, matchByte); + } + + // + // Write the symbol and indicate that the last sequence was a literal + // + DtPutSymbol(symbol); + LzSetLiteral(&Decoder.Sequence); +} + +void +LzDecodeLen ( + PLENGTH_DECODER_STATE LenState, + uint8_t PosBit + ) +{ + uint16_t* probArray; + uint16_t limit; + + // + // Lenghts of 2 and higher are encoded in 3 possible types of arithmetic- + // coded bit trees, depending on the size of the length. + // + // Lengths 2-9 are encoded in trees called "Low" using 3 bits of data. + // Lengths 10-17 are encoded in trees called "Mid" using 3 bits of data. + // Lengths 18-273 are encoded in a tree called "high" using 8 bits of data. + // + // The appropriate "Low" or "Mid" tree is selected based on the bottom 2 + // position bits (0-3) (in the LZMA standard, this is based on the "pb", + // while the "High" tree is shared for all positions. + // + // Two arithmetic-coded bit trees, called "Choice" and "Choice2" tell us + // the type of Length, so we can choose the right tree. {0, n} tells us + // to use the Low trees, while {1, 0} tells us to use the Mid trees. Lastly + // {1, 1} tells us to use the High tree. + // + Decoder.Len = LZMA_MIN_LENGTH; + if (RcIsBitSet(&LenState->Choice)) + { + if (RcIsBitSet(&LenState->Choice2)) + { + probArray = LenState->High; + limit = LZMA_MAX_HIGH_LENGTH; + Decoder.Len += LZMA_MAX_LOW_LENGTH + LZMA_MAX_MID_LENGTH; + } + else + { + probArray = LenState->Mid[PosBit]; + limit = LZMA_MAX_MID_LENGTH; + Decoder.Len += LZMA_MAX_LOW_LENGTH; + } + } + else + { + probArray = LenState->Low[PosBit]; + limit = LZMA_MAX_LOW_LENGTH; + } + Decoder.Len += RcGetBitTree(probArray, limit); +} + +void +LzDecodeMatch ( + uint8_t PosBit + ) +{ + uint16_t* probArray; + uint8_t distSlot, distBits; + + // + // Decode the length component of the "match" sequence. Then, since we're + // about to decode a new distance, update our history by one level. + // + LzDecodeLen(&Decoder.u.BitModel.MatchLen, PosBit); + Decoder.Rep3 = Decoder.Rep2; + Decoder.Rep2 = Decoder.Rep1; + Decoder.Rep1 = Decoder.Rep0; + + // + // Read the first 6 bits, which make up the "distance slot" + // + probArray = LzGetDistSlot(); + distSlot = RcGetBitTree(probArray, LZMA_DISTANCE_SLOTS); + if (distSlot < LZMA_FIRST_CONTEXT_DISTANCE_SLOT) + { + // + // Slots 0-3 directly encode the distance as a literal number + // + Decoder.Rep0 = distSlot; + } + else + { + // + // For slots 4-13, figure out how many "context encoded bits" are used + // to encode this distance. The math works out such that slots 4-5 use + // 1 bit, 6-7 use 2 bits, 8-9 use 3 bits, and so on and so forth until + // slots 12-13 which use 5 bits. + // + // This gives us anywhere from 1-5 bits, plus the two upper bits which + // can either be 0b10 or 0b11 (based on the bottom bit of the distance + // slot). Thus, with the context encoded bits, we can represent lengths + // anywhere from 0b10[0] to 0b11[11111] (i.e.: 4-127). + // + // For slots 14-63, we use "fixed 50% probability bits" which are also + // called "direct bits". The formula below also tells us how many such + // direct bits to use in this scenario. In other words, distBits can + // either be the number of "context encoded bits" for slots 4-13, or it + // can be the the number of "direct bits" for slots 14-63. This gives + // us a range of of 2 to 26 bits, which are then used as middle bits. + // Finally, the last 4 bits are called the "align" bits. The smallest + // possible number we can encode is now going to be 0b10[00][0000] and + // the highest is 0b11[1111111111111111111111111][1111], in other words + // 128 to (2^31)-1. + // + distBits = (distSlot >> 1) - 1; + Decoder.Rep0 = (0b10 | (distSlot & 1)) << distBits; + + // + // Slots 4-13 have their own arithmetic-coded reverse bit trees. Slots + // 14-63 encode the middle "direct bits" with fixed 50% probability and + // the bottom 4 "align bits" with a shared arithmetic-coded reverse bit + // tree. + // + if (distSlot < LZMA_FIRST_FIXED_DISTANCE_SLOT) + { + probArray = &Decoder.u.BitModel.Dist[Decoder.Rep0 - distSlot]; + } + else + { + Decoder.Rep0 |= RcGetFixed(distBits - LZMA_DISTANCE_ALIGN_BITS) << + LZMA_DISTANCE_ALIGN_BITS; + distBits = LZMA_DISTANCE_ALIGN_BITS; + probArray = Decoder.u.BitModel.Align; + } + Decoder.Rep0 |= RcGetReverseBitTree(probArray, distBits); + } + + // + // Indicate that the last sequence was a "match" + // + LzSetMatch(&Decoder.Sequence); +} + +void +LzDecodeRepLen ( + uint8_t PosBit, + bool IsLongRep + ) +{ + // + // Decode the length byte and indicate the last sequence was a "rep". + // If this is a short rep, then the length is always hard-coded to 1. + // + if (IsLongRep) + { + LzDecodeLen(&Decoder.u.BitModel.RepLen, PosBit); + LzSetLongRep(&Decoder.Sequence); + } + else + { + Decoder.Len = 1; + LzSetShortRep(&Decoder.Sequence); + } +} + +void +LzDecodeRep0( + uint8_t PosBit + ) +{ + uint8_t bit; + + // + // This could be a "short rep" with a length of 1, or a "long rep0" with + // a length that we have to decode. The next bit tells us this, using the + // arithmetic-coded bit trees stored in "Rep0Long", with 1 tree for each + // position bit (0-3). + // + bit = RcIsBitSet(&Decoder.u.BitModel.Rep0Long[Decoder.Sequence][PosBit]); + LzDecodeRepLen(PosBit, bit); +} + +void +LzDecodeLongRep ( + uint8_t PosBit + ) +{ + uint32_t newRep; + + // + // Read the next 2 bits to figure out which of the recently used distances + // we should use for this match. The following three states are possible : + // + // {0,n} - "Long rep1", where the length is stored in an arithmetic-coded + // bit tree, and the distance is the 2nd most recently used distance (Rep1) + // + // {1,0} - "Long rep2", where the length is stored in an arithmetic-coded + // bit tree, and the distance is the 3rd most recently used distance (Rep2) + // + // {1,1} - "Long rep3", where the length is stored in an arithmetic-coded + // bit tree, and the distance is the 4th most recently used distance (Rep3) + // + // Once we have the right one, we must slide down each previously recently + // used distance, so that the distance we're now using (Rep1, Rep2 or Rep3) + // becomes "Rep0" again. + // + if (RcIsBitSet(&Decoder.u.BitModel.Rep1[Decoder.Sequence])) + { + if (RcIsBitSet(&Decoder.u.BitModel.Rep2[Decoder.Sequence])) + { + newRep = Decoder.Rep3; + Decoder.Rep3 = Decoder.Rep2; + } + else + { + newRep = Decoder.Rep2; + } + Decoder.Rep2 = Decoder.Rep1; + } + else + { + newRep = Decoder.Rep1; + } + Decoder.Rep1 = Decoder.Rep0; + Decoder.Rep0 = newRep; + LzDecodeRepLen(PosBit, true); +} + +void +LzDecodeRep ( + uint8_t PosBit + ) +{ + // + // We know this is an LZ77 distance-length pair where the distance is based + // on a history of up to 4 previously used distance (Rep0-3). To know which + // distance to use, the following 5 bit positions are possible (keeping in + // mind that we've already decoded the first 2 bits {1,1} in LzDecode which + // got us here in the first place): + // + // {0,0} - "Short rep", where the length is always 1 and distance is always + // the most recently used distance (Rep0). + // + // {0,1} - "Long rep0", where the length is stored in an arithmetic-coded + // bit tree, and the distance is the most recently used distance (Rep0). + // + // Because both of these possibilities just use Rep0, LzDecodeRep0 handles + // these two cases. Otherwise, we use LzDecodeLongRep to read up to two + // additional bits to figure out which recently used distance (1, 2, or 3) + // to use. + // + if (RcIsBitSet(&Decoder.u.BitModel.Rep0[Decoder.Sequence])) + { + LzDecodeLongRep(PosBit); + } + else + { + LzDecodeRep0(PosBit); + } +} + +bool +LzDecode ( + void + ) +{ + uint32_t position; + uint8_t posBit; + + // + // Get the current position in dictionary, making sure we have input bytes. + // Once we run out of bytes, normalize the last arithmetic coded byte and + // ensure there's no pending lengths that we haven't yet repeated. + // + while (DtCanWrite(&position) && RcCanRead()) + { + // + // An LZMA packet begins here, which can have 3 possible initial bit + // sequences that correspond to the type of encoding that was chosen + // to represent the next stream of symbols. + // + // {0, n} represents a "literal", which LzDecodeLiteral decodes. + // Literals are a single byte encoded with arithmetic-coded bit trees + // + // {1, 0} represents a "match", which LzDecodeMatch decodes. + // Matches are typical LZ77 sequences with explicit length and distance + // + // {1, 1} represents a "rep", which LzDecodeRep decodes. + // Reps are LZ77 sequences where the distance is encoded as a reference + // to a previously used distance (up to 4 -- called "Rep0-3"). + // + // Once we've decoded either the "match" or the "rep', we now have the + // distance in "Rep0" (the most recently used distance) and the length + // in "Len", so we will use DtRepeatSymbol to go back in the dictionary + // buffer "Rep0" bytes and repeat that character "Len" times. + // + posBit = position & (LZMA_POSITION_COUNT - 1); + if (RcIsBitSet(&Decoder.u.BitModel.Match[Decoder.Sequence][posBit])) + { + if (RcIsBitSet(&Decoder.u.BitModel.Rep[Decoder.Sequence])) + { + LzDecodeRep(posBit); + } + else + { + LzDecodeMatch(posBit); + } + + if (!DtRepeatSymbol(Decoder.Len, Decoder.Rep0 + 1)) + { + return false; + } + Decoder.Len = 0; + } + else + { + LzDecodeLiteral(); + } + } + RcNormalize(); + return (Decoder.Len == 0); +} + +void +LzResetState ( + void + ) +{ + // + // Initialize decoder to default state in case we're called more than once. + // The LZMA "Bit Model" is an adaptive arithmetic-coded probability-based + // bit tree which encodes either a "0" or a "1". + // + Decoder.Sequence = LzmaLitLitLitState; + Decoder.Rep0 = Decoder.Rep1 = Decoder.Rep2 = Decoder.Rep3 = 0; + static_assert((LZMA_BIT_MODEL_SLOTS * 2) == sizeof(Decoder.u.BitModel), + "Invalid size"); + for (int i = 0; i < LZMA_BIT_MODEL_SLOTS; i++) + { + RcSetDefaultProbability(&Decoder.u.RawProbabilities[i]); + } +} + +bool +LzInitialize ( + uint8_t Properties + ) +{ + if (Properties != k_LzSupportedProperties) + { + return false; + } + LzResetState(); + return true; +} |
