Project Alice
|
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "../common/mem.h"
#include "../common/fse.h"
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "../common/xxhash.h"
#include "../compress/zstd_compress_internal.h"
#include "../zdict.h"
#include "divsufsort.h"
#include "../common/bits.h"
Go to the source code of this file.
Classes | |
struct | dictItem |
struct | EStats_ress_t |
struct | offsetCount_t |
Macros | |
#define | MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */ |
#define | ZDICT_MAX_SAMPLES_SIZE (2000U << 20) |
#define | ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO) |
#define | _FILE_OFFSET_BITS 64 |
#define | _LARGEFILE64_SOURCE |
#define | ZDICT_STATIC_LINKING_ONLY |
#define | KB *(1 <<10) |
#define | MB *(1 <<20) |
#define | GB *(1U<<30) |
#define | DICTLISTSIZE_DEFAULT 10000 |
#define | NOISELENGTH 32 |
#define | DISPLAY(...) do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0) |
#define | DISPLAYLEVEL(l, ...) do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0) /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ |
#define | LLIMIT 64 /* heuristic determined experimentally */ |
#define | MINMATCHLENGTH 7 /* heuristic determined experimentally */ |
#define | DISPLAYUPDATE(l, ...) |
#define | MAXREPOFFSET 1024 |
#define | OFFCODE_MAX 30 /* only applicable to first block */ |
#define | HBUFFSIZE 256 /* should prove large enough for all entropy headers */ |
Functions | |
unsigned | ZDICT_isError (size_t errorCode) |
const char * | ZDICT_getErrorName (size_t errorCode) |
unsigned | ZDICT_getDictID (const void *dictBuffer, size_t dictSize) |
size_t | ZDICT_getDictHeaderSize (const void *dictBuffer, size_t dictSize) |
size_t | ZDICT_finalizeDictionary (void *dictBuffer, size_t dictBufferCapacity, const void *customDictContent, size_t dictContentSize, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_params_t params) |
size_t | ZDICT_trainFromBuffer_legacy (void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t params) |
size_t | ZDICT_trainFromBuffer (void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples) |
size_t | ZDICT_addEntropyTablesFromBuffer (void *dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples) |
#define DISPLAY | ( | ... | ) | do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0) |
#define DISPLAYLEVEL | ( | l, | |
... | |||
) | do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0) /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ |
#define DISPLAYUPDATE | ( | l, | |
... | |||
) |
#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */ |
#define MINMATCHLENGTH 7 /* heuristic determined experimentally */ |
#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */ |
#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO) |
size_t ZDICT_addEntropyTablesFromBuffer | ( | void * | dictBuffer, |
size_t | dictContentSize, | ||
size_t | dictBufferCapacity, | ||
const void * | samplesBuffer, | ||
const size_t * | samplesSizes, | ||
unsigned | nbSamples | ||
) |
size_t ZDICT_finalizeDictionary | ( | void * | dstDictBuffer, |
size_t | maxDictSize, | ||
const void * | dictContent, | ||
size_t | dictContentSize, | ||
const void * | samplesBuffer, | ||
const size_t * | samplesSizes, | ||
unsigned | nbSamples, | ||
ZDICT_params_t | parameters | ||
) |
ZDICT_finalizeDictionary(): Given a custom content as a basis for dictionary, and a set of samples, finalize dictionary by adding headers and statistics according to the zstd dictionary format.
Samples must be stored concatenated in a flat buffer samplesBuffer
, supplied with an array of sizes samplesSizes
, providing the size of each sample in order. The samples are used to construct the statistics, so they should be representative of what you will compress with this dictionary.
The compression level can be set in parameters
. You should pass the compression level you expect to use in production. The statistics for each compression level differ, so tuning the dictionary for the compression level can help quite a bit.
You can set an explicit dictionary ID in parameters
, or allow us to pick a random dictionary ID for you, but we can't guarantee no collisions.
The dstDictBuffer and the dictContent may overlap, and the content will be appended to the end of the header. If the header + the content doesn't fit in maxDictSize the beginning of the content is truncated to make room, since it is presumed that the most profitable content is at the end of the dictionary, since that is the cheapest to reference.
maxDictSize
must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN).
dstDictBuffer
(<= maxDictSize
), or an error code, which can be tested by ZDICT_isError(). Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. NOTE: This function currently may fail in several edge cases including:Definition at line 858 of file zdict.c.
size_t ZDICT_getDictHeaderSize | ( | const void * | dictBuffer, |
size_t | dictSize | ||
) |
unsigned ZDICT_getDictID | ( | const void * | dictBuffer, |
size_t | dictSize | ||
) |
const char * ZDICT_getErrorName | ( | size_t | errorCode | ) |
unsigned ZDICT_isError | ( | size_t | errorCode | ) |
size_t ZDICT_trainFromBuffer | ( | void * | dictBuffer, |
size_t | dictBufferCapacity, | ||
const void * | samplesBuffer, | ||
const size_t * | samplesSizes, | ||
unsigned | nbSamples | ||
) |
ZDICT_trainFromBuffer(): Train a dictionary from an array of samples. Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4, f=20, and accel=1. Samples must be stored concatenated in a single flat buffer samplesBuffer
, supplied with an array of sizes samplesSizes
, providing the size of each sample, in order. The resulting dictionary will be saved into dictBuffer
.
dictBuffer
(<= dictBufferCapacity
) or an error code, which can be tested with ZDICT_isError(). Note: Dictionary training will fail if there are not enough samples to construct a dictionary, or if most of the samples are too small (< 8 bytes being the lower limit). If dictionary training fails, you should use zstd without a dictionary, as the dictionary would've been ineffective anyways. If you believe your samples would benefit from a dictionary please open an issue with details, and we can look into it. Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB. Tips: In general, a reasonable dictionary has a size of ~ 100 KB. It's possible to select smaller or larger size, just by specifying dictBufferCapacity
. In general, it's recommended to provide a few thousands samples, though this can vary a lot. It's recommended that total size of all samples be about ~x100 times the target size of dictionary. Definition at line 1107 of file zdict.c.