#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "../common/mem.h"
#include "../common/fse.h"
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "../common/xxhash.h"
#include "../compress/zstd_compress_internal.h"
#include "../zdict.h"
#include "divsufsort.h"
#include "../common/bits.h"

Include dependency graph for zdict.c:

Classes
struct	dictItem

struct	EStats_ress_t

struct	offsetCount_t

Macros
#define	MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */

#define	ZDICT_MAX_SAMPLES_SIZE (2000U << 20)

#define	ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)

#define	_FILE_OFFSET_BITS 64

#define	_LARGEFILE64_SOURCE

#define	ZDICT_STATIC_LINKING_ONLY

#define	KB *(1 <<10)

#define	MB *(1 <<20)

#define	GB *(1U<<30)

#define	DICTLISTSIZE_DEFAULT 10000

#define	NOISELENGTH 32

#define	DISPLAY(...) do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0)

#define	DISPLAYLEVEL(l, ...) do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0) /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */

#define	LLIMIT 64 /* heuristic determined experimentally */

#define	MINMATCHLENGTH 7 /* heuristic determined experimentally */

#define	DISPLAYUPDATE(l, ...)

#define	MAXREPOFFSET 1024

#define	OFFCODE_MAX 30 /* only applicable to first block */

#define	HBUFFSIZE 256 /* should prove large enough for all entropy headers */

Functions
unsigned	ZDICT_isError (size_t errorCode)

const char *	ZDICT_getErrorName (size_t errorCode)

unsigned	ZDICT_getDictID (const void *dictBuffer, size_t dictSize)

size_t	ZDICT_getDictHeaderSize (const void *dictBuffer, size_t dictSize)

size_t	ZDICT_finalizeDictionary (void *dictBuffer, size_t dictBufferCapacity, const void *customDictContent, size_t dictContentSize, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_params_t params)

size_t	ZDICT_trainFromBuffer_legacy (void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t params)

size_t	ZDICT_trainFromBuffer (void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples)

size_t	ZDICT_addEntropyTablesFromBuffer (void *dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples)

Macro Definition Documentation

◆ _FILE_OFFSET_BITS

#define _FILE_OFFSET_BITS 64

Definition at line 24 of file zdict.c.

◆ _LARGEFILE64_SOURCE

#define _LARGEFILE64_SOURCE

Definition at line 31 of file zdict.c.

◆ DICTLISTSIZE_DEFAULT

#define DICTLISTSIZE_DEFAULT 10000

Definition at line 66 of file zdict.c.

◆ DISPLAY

#define DISPLAY ( ... ) do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0)

Definition at line 77 of file zdict.c.

◆ DISPLAYLEVEL

#define DISPLAYLEVEL	(	l,
		...
	)	do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0) /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */

Definition at line 79 of file zdict.c.

◆ DISPLAYUPDATE

#define DISPLAYUPDATE	(	l,
		...
	)

Value:

        do {                                                       \
            if (notificationLevel>=l) {                            \
                if (ZDICT_clockSpan(displayClock) > refreshRate) { \
                    displayClock = clock();                        \
                    DISPLAY(__VA_ARGS__);                          \
                }                                                  \
                if (notificationLevel>=4) fflush(stderr);          \
            }                                                      \
        } while (0)

◆ GB

#define GB *(1U<<30)

Definition at line 64 of file zdict.c.

◆ HBUFFSIZE

#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */

◆ KB

#define KB *(1 <<10)

Definition at line 62 of file zdict.c.

◆ LLIMIT

#define LLIMIT 64 /* heuristic determined experimentally */

Definition at line 167 of file zdict.c.

◆ MAXREPOFFSET

#define MAXREPOFFSET 1024

Definition at line 564 of file zdict.c.

◆ MB

#define MB *(1 <<20)

Definition at line 63 of file zdict.c.

◆ MINMATCHLENGTH

#define MINMATCHLENGTH 7 /* heuristic determined experimentally */

Definition at line 168 of file zdict.c.

◆ MINRATIO

#define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */

Definition at line 15 of file zdict.c.

◆ NOISELENGTH

#define NOISELENGTH 32

Definition at line 68 of file zdict.c.

◆ OFFCODE_MAX

#define OFFCODE_MAX 30 /* only applicable to first block */

Definition at line 658 of file zdict.c.

◆ ZDICT_MAX_SAMPLES_SIZE

#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)

Definition at line 16 of file zdict.c.

◆ ZDICT_MIN_SAMPLES_SIZE

#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)

Definition at line 17 of file zdict.c.

◆ ZDICT_STATIC_LINKING_ONLY

#define ZDICT_STATIC_LINKING_ONLY

Definition at line 45 of file zdict.c.

Function Documentation

◆ ZDICT_addEntropyTablesFromBuffer()

size_t ZDICT_addEntropyTablesFromBuffer	(	void *	dictBuffer,
		size_t	dictContentSize,
		size_t	dictBufferCapacity,
		const void *	samplesBuffer,
		const size_t *	samplesSizes,
		unsigned	nbSamples
	)

Definition at line 1125 of file zdict.c.

◆ ZDICT_finalizeDictionary()

size_t ZDICT_finalizeDictionary	(	void *	dstDictBuffer,
		size_t	maxDictSize,
		const void *	dictContent,
		size_t	dictContentSize,
		const void *	samplesBuffer,
		const size_t *	samplesSizes,
		unsigned	nbSamples,
		ZDICT_params_t	parameters
	)

ZDICT_finalizeDictionary(): Given a custom content as a basis for dictionary, and a set of samples, finalize dictionary by adding headers and statistics according to the zstd dictionary format.

Samples must be stored concatenated in a flat buffer samplesBuffer, supplied with an array of sizes samplesSizes, providing the size of each sample in order. The samples are used to construct the statistics, so they should be representative of what you will compress with this dictionary.

The compression level can be set in parameters. You should pass the compression level you expect to use in production. The statistics for each compression level differ, so tuning the dictionary for the compression level can help quite a bit.

You can set an explicit dictionary ID in parameters, or allow us to pick a random dictionary ID for you, but we can't guarantee no collisions.

The dstDictBuffer and the dictContent may overlap, and the content will be appended to the end of the header. If the header plus the content does not fit in maxDictSize, the beginning of the content is truncated to make room: the most profitable content is presumed to be at the end of the dictionary, since that is the cheapest to reference.

maxDictSize must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN).

Returns

Size of dictionary stored into dstDictBuffer (<= maxDictSize), or an error code, which can be tested by ZDICT_isError(). Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. Note: This function currently may fail in several edge cases, including:

Not enough samples
Samples are uncompressible
Samples are all exactly the same

Definition at line 858 of file zdict.c.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ ZDICT_getDictHeaderSize()

size_t ZDICT_getDictHeaderSize	(	const void *	dictBuffer,
		size_t	dictSize
	)

Definition at line 109 of file zdict.c.

Here is the call graph for this function:

◆ ZDICT_getDictID()

unsigned ZDICT_getDictID	(	const void *	dictBuffer,
		size_t	dictSize
	)

Extracts dictID.

Returns: zero if error (not a valid dictionary)

Definition at line 102 of file zdict.c.

Here is the call graph for this function:

◆ ZDICT_getErrorName()

const char * ZDICT_getErrorName ( size_t errorCode )

Definition at line 100 of file zdict.c.

Here is the call graph for this function:

◆ ZDICT_isError()

unsigned ZDICT_isError ( size_t errorCode )

Definition at line 98 of file zdict.c.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ ZDICT_trainFromBuffer()

size_t ZDICT_trainFromBuffer	(	void *	dictBuffer,
		size_t	dictBufferCapacity,
		const void *	samplesBuffer,
		const size_t *	samplesSizes,
		unsigned	nbSamples
	)

ZDICT_trainFromBuffer(): Train a dictionary from an array of samples. Redirects to ZDICT_optimizeTrainFromBuffer_fastCover(), single-threaded, with d=8, steps=4, f=20, and accel=1. Samples must be stored concatenated in a single flat buffer samplesBuffer, supplied with an array of sizes samplesSizes, providing the size of each sample, in order. The resulting dictionary will be saved into dictBuffer.

Returns: size of dictionary stored into dictBuffer (<= dictBufferCapacity) or an error code, which can be tested with ZDICT_isError(). Note: Dictionary training will fail if there are not enough samples to construct a dictionary, or if most of the samples are too small (< 8 bytes being the lower limit). If dictionary training fails, you should use zstd without a dictionary, as the dictionary would have been ineffective anyway. If you believe your samples would benefit from a dictionary, please open an issue with details, and we can look into it. Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB. Tips: In general, a reasonable dictionary has a size of ~100 KB. It's possible to select a smaller or larger size just by specifying dictBufferCapacity. In general, it's recommended to provide a few thousand samples, though this can vary a lot. It's recommended that the total size of all samples be about 100 times the target size of the dictionary.

Definition at line 1107 of file zdict.c.

Here is the call graph for this function:

◆ ZDICT_trainFromBuffer_legacy()

size_t ZDICT_trainFromBuffer_legacy	(	void *	dictBuffer,
		size_t	dictBufferCapacity,
		const void *	samplesBuffer,
		const size_t *	samplesSizes,
		unsigned	nbSamples,
		ZDICT_legacy_params_t	params
	)

Definition at line 1084 of file zdict.c.

Classes

Macros

Functions

Macro Definition Documentation

◆ _FILE_OFFSET_BITS

◆ _LARGEFILE64_SOURCE

◆ DICTLISTSIZE_DEFAULT

◆ DISPLAY

◆ DISPLAYLEVEL

◆ DISPLAYUPDATE

◆ GB

◆ HBUFFSIZE

◆ KB

◆ LLIMIT

◆ MAXREPOFFSET

◆ MB

◆ MINMATCHLENGTH

◆ MINRATIO

◆ NOISELENGTH

◆ OFFCODE_MAX

◆ ZDICT_MAX_SAMPLES_SIZE

◆ ZDICT_MIN_SAMPLES_SIZE

◆ ZDICT_STATIC_LINKING_ONLY

Function Documentation

◆ ZDICT_addEntropyTablesFromBuffer()

◆ ZDICT_finalizeDictionary()

◆ ZDICT_getDictHeaderSize()

◆ ZDICT_getDictID()

◆ ZDICT_getErrorName()

◆ ZDICT_isError()

◆ ZDICT_trainFromBuffer()

◆ ZDICT_trainFromBuffer_legacy()