Project Alice
Loading...
Searching...
No Matches
zdict.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11#if defined (__cplusplus)
12extern "C" {
13#endif
14
15#ifndef ZSTD_ZDICT_H
16#define ZSTD_ZDICT_H
17
18/*====== Dependencies ======*/
19#include <stddef.h> /* size_t */
20
21
22/* ===== ZDICTLIB_API : control library symbols visibility ===== */
23#ifndef ZDICTLIB_VISIBLE
24 /* Backwards compatibility with old macro name */
25# ifdef ZDICTLIB_VISIBILITY
26# define ZDICTLIB_VISIBLE ZDICTLIB_VISIBILITY
27# elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
28# define ZDICTLIB_VISIBLE __attribute__ ((visibility ("default")))
29# else
30# define ZDICTLIB_VISIBLE
31# endif
32#endif
33
34#ifndef ZDICTLIB_HIDDEN
35# if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
36# define ZDICTLIB_HIDDEN __attribute__ ((visibility ("hidden")))
37# else
38# define ZDICTLIB_HIDDEN
39# endif
40#endif
41
42#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
43# define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBLE
44#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
45# define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
46#else
47# define ZDICTLIB_API ZDICTLIB_VISIBLE
48#endif
49
50/*******************************************************************************
51 * Zstd dictionary builder
52 *
53 * FAQ
54 * ===
55 * Why should I use a dictionary?
56 * ------------------------------
57 *
58 * Zstd can use dictionaries to improve compression ratio of small data.
59 * Traditionally small files don't compress well because there is very little
60 * repetition in a single sample, since it is small. But, if you are compressing
61 * many similar files, like a bunch of JSON records that share the same
62 * structure, you can train a dictionary ahead of time on some samples of
63 * these files. Then, zstd can use the dictionary to find repetitions that are
64 * present across samples. This can vastly improve compression ratio.
65 *
66 * When is a dictionary useful?
67 * ----------------------------
68 *
69 * Dictionaries are useful when compressing many small files that are similar.
70 * The larger a file is, the less benefit a dictionary will have. Generally,
71 * we don't expect dictionary compression to be effective past 100KB. And the
72 * smaller a file is, the more we would expect the dictionary to help.
73 *
74 * How do I use a dictionary?
75 * --------------------------
76 *
77 * Simply pass the dictionary to the zstd compressor with
78 * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
79 * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
80 * more advanced functions that allow selecting some options, see zstd.h for
81 * complete documentation.
82 *
83 * What is a zstd dictionary?
84 * --------------------------
85 *
86 * A zstd dictionary has two pieces: Its header, and its content. The header
87 * contains a magic number, the dictionary ID, and entropy tables. These
88 * entropy tables allow zstd to save on header costs in the compressed file,
89 * which really matters for small data. The content is just bytes, which are
90 * repeated content that is common across many samples.
91 *
92 * What is a raw content dictionary?
93 * ---------------------------------
94 *
95 * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
96 * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
97 * content dictionary.
98 *
99 * How do I train a dictionary?
100 * ----------------------------
101 *
102 * Gather samples from your use case. These samples should be similar to each
103 * other. If you have several use cases, you could try to train one dictionary
104 * per use case.
105 *
106 * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
107 * dictionary. There are a few advanced versions of this function, but this
108 * is a great starting point. If you want to further tune your dictionary
109 * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
110 * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
111 *
112 * If the dictionary training function fails, that is likely because you
113 * either passed too few samples, or a dictionary would not be effective
114 * for your data. Look at the messages that the dictionary trainer printed:
115 * if they don't mention too few samples, then a dictionary would not be effective.
116 *
117 * How large should my dictionary be?
118 * ----------------------------------
119 *
120 * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
121 * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
122 * dictionary larger than that. But, most use cases can get away with a
123 * smaller dictionary. The advanced dictionary builders can automatically
124 * shrink the dictionary for you, and select the smallest size that doesn't
125 * hurt compression ratio too much. See the `shrinkDict` parameter.
126 * A smaller dictionary can save memory, and potentially speed up
127 * compression.
128 *
129 * How many samples should I provide to the dictionary builder?
130 * ------------------------------------------------------------
131 *
132 * We generally recommend passing ~100x the size of the dictionary
133 * in samples. A few thousand should suffice. Having too few samples
134 * can hurt the dictionary's effectiveness. Having more samples will
135 * only improve the dictionary's effectiveness. But having too many
136 * samples can slow down the dictionary builder.
137 *
138 * How do I determine if a dictionary will be effective?
139 * -----------------------------------------------------
140 *
141 * Simply train a dictionary and try it out. You can use zstd's built-in
142 * benchmarking tool to test the dictionary effectiveness.
143 *
144 * # Benchmark levels 1-3 without a dictionary
145 * zstd -b1e3 -r /path/to/my/files
146 * # Benchmark levels 1-3 with a dictionary
147 * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
148 *
149 * When should I retrain a dictionary?
150 * -----------------------------------
151 *
152 * You should retrain a dictionary when its effectiveness drops. Dictionary
153 * effectiveness drops as the data you are compressing changes. Generally, we do
154 * expect dictionaries to "decay" over time, as your data changes, but the rate
155 * at which they decay depends on your use case. Internally, we regularly
156 * retrain dictionaries, and if the new dictionary performs significantly
157 * better than the old dictionary, we will ship the new dictionary.
158 *
159 * I have a raw content dictionary, how do I turn it into a zstd dictionary?
160 * -------------------------------------------------------------------------
161 *
162 * If you have a raw content dictionary, e.g. by manually constructing it, or
163 * using a third-party dictionary builder, you can turn it into a zstd
164 * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
165 * provide some samples of the data. It will add the zstd header to the
166 * raw content, which contains a dictionary ID and entropy tables, which
167 * will improve compression ratio, and allow zstd to write the dictionary ID
168 * into the frame, if you so choose.
169 *
170 * Do I have to use zstd's dictionary builder?
171 * -------------------------------------------
172 *
173 * No! You can construct dictionary content however you please, it is just
174 * bytes. It will always be valid as a raw content dictionary. If you want
175 * a zstd dictionary, which can improve compression ratio, use
176 * `ZDICT_finalizeDictionary()`.
177 *
178 * What is the attack surface of a zstd dictionary?
179 * ------------------------------------------------
180 *
181 * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
182 * zstd should never crash, or access out-of-bounds memory no matter what
183 * the dictionary is. However, if an attacker can control the dictionary
184 * during decompression, they can cause zstd to generate arbitrary bytes,
185 * just like if they controlled the compressed data.
186 *
187 ******************************************************************************/
188
189
210ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
211 const void* samplesBuffer,
212 const size_t* samplesSizes, unsigned nbSamples);
213
214typedef struct {
217 unsigned dictID;
226
262ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
263 const void* dictContent, size_t dictContentSize,
264 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
265 ZDICT_params_t parameters);
266
267
268/*====== Helper functions ======*/
269ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize);
270ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */
271ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
272ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
273
274#endif /* ZSTD_ZDICT_H */
275
276#if defined(ZDICT_STATIC_LINKING_ONLY) && !defined(ZSTD_ZDICT_H_STATIC)
277#define ZSTD_ZDICT_H_STATIC
278
279/* This can be overridden externally to hide static symbols. */
280#ifndef ZDICTLIB_STATIC_API
281# if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
282# define ZDICTLIB_STATIC_API __declspec(dllexport) ZDICTLIB_VISIBLE
283# elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
284# define ZDICTLIB_STATIC_API __declspec(dllimport) ZDICTLIB_VISIBLE
285# else
286# define ZDICTLIB_STATIC_API ZDICTLIB_VISIBLE
287# endif
288#endif
289
290/* ====================================================================================
291 * The definitions in this section are considered experimental.
292 * They should never be used with a dynamic library, as they may change in the future.
293 * They are provided for advanced usages.
294 * Use them only in association with static linking.
295 * ==================================================================================== */
296
297#define ZDICT_DICTSIZE_MIN 256
298/* Deprecated: Remove in v1.6.0 */
299#define ZDICT_CONTENTSIZE_MIN 128
300
/* Parameter set taken by ZDICT_trainFromBuffer_cover() (by value) and by
 * ZDICT_optimizeTrainFromBuffer_cover() (by pointer).
 * Fields marked "Only used for optimization" are documented below as applying
 * to the optimize variant only. */
305typedef struct {
306 unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
307 unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
308 unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
309 unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
310 double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
311 unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
312 unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
313 ZDICT_params_t zParams; /* Nested ZDICT_params_t settings, shared with the other *_params_t structs in this header */
314} ZDICT_cover_params_t;
315
/* Parameter set taken by ZDICT_trainFromBuffer_fastCover() (by value) and by
 * ZDICT_optimizeTrainFromBuffer_fastCover() (by pointer).
 * Has the same fields as ZDICT_cover_params_t plus `f` and `accel`; note that
 * the documented default for splitPoint differs (0.75 here). */
316typedef struct {
317 unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
318 unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
319 unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(20) -- NOTE(review): every other field in this struct uses 0 as the "default" sentinel; confirm against the implementation whether this should read "0 means default(20)" */
320 unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
321 unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
322 double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
323 unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
324 unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
325 unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
326
327 ZDICT_params_t zParams; /* Nested ZDICT_params_t settings, shared with the other *_params_t structs in this header */
328} ZDICT_fastCover_params_t;
329
344ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
345 void *dictBuffer, size_t dictBufferCapacity,
346 const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
347 ZDICT_cover_params_t parameters);
348
366ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
367 void* dictBuffer, size_t dictBufferCapacity,
368 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
369 ZDICT_cover_params_t* parameters);
370
387ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
388 size_t dictBufferCapacity, const void *samplesBuffer,
389 const size_t *samplesSizes, unsigned nbSamples,
390 ZDICT_fastCover_params_t parameters);
391
410ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
411 size_t dictBufferCapacity, const void* samplesBuffer,
412 const size_t* samplesSizes, unsigned nbSamples,
413 ZDICT_fastCover_params_t* parameters);
414
/* Parameter set taken (by value) by ZDICT_trainFromBuffer_legacy(). */
415typedef struct {
416 unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
417 ZDICT_params_t zParams; /* Nested ZDICT_params_t settings, shared with the other *_params_t structs in this header */
418} ZDICT_legacy_params_t;
419
435ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_legacy(
436 void* dictBuffer, size_t dictBufferCapacity,
437 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
438 ZDICT_legacy_params_t parameters);
439
440
441/* Deprecation warnings */
442/* It is generally possible to disable deprecation warnings from the compiler,
443 for example with -Wno-deprecated-declarations for gcc
444 or _CRT_SECURE_NO_WARNINGS in Visual Studio.
445 Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
446#ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
447# define ZDICT_DEPRECATED(message) /* disable deprecation warnings */
448#else
449# define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
450# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
451# define ZDICT_DEPRECATED(message) [[deprecated(message)]]
452# elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
453# define ZDICT_DEPRECATED(message) __attribute__((deprecated(message)))
454# elif (ZDICT_GCC_VERSION >= 301)
455# define ZDICT_DEPRECATED(message) __attribute__((deprecated))
456# elif defined(_MSC_VER)
457# define ZDICT_DEPRECATED(message) __declspec(deprecated(message))
458# else
459# pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
460# define ZDICT_DEPRECATED(message)
461# endif
462#endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
463
464ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
465ZDICTLIB_STATIC_API
466size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
467 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
468
469
470#endif /* ZSTD_ZDICT_H_STATIC */
471
472#if defined (__cplusplus)
473}
474#endif
ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_cover_params_t parameters)
Definition: cover.c:738
ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_cover_params_t *parameters)
Definition: cover.c:1126
ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters)
Definition: fastcover.c:549
ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t *parameters)
Definition: fastcover.c:618
int compressionLevel
Definition: zdict.h:215
unsigned dictID
Definition: zdict.h:217
unsigned notificationLevel
Definition: zdict.h:216
size_t ZDICT_trainFromBuffer_legacy(void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t params)
Definition: zdict.c:1084
size_t ZDICT_addEntropyTablesFromBuffer(void *dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples)
Definition: zdict.c:1125
ZDICTLIB_API size_t ZDICT_finalizeDictionary(void *dstDictBuffer, size_t maxDictSize, const void *dictContent, size_t dictContentSize, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_params_t parameters)
Definition: zdict.c:858
#define ZDICTLIB_API
Definition: zdict.h:47
ZDICTLIB_API size_t ZDICT_trainFromBuffer(void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples)
Definition: zdict.c:1107
ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void *dictBuffer, size_t dictSize)
Definition: zdict.c:109
ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode)
Definition: zdict.c:98
ZDICTLIB_API const char * ZDICT_getErrorName(size_t errorCode)
Definition: zdict.c:100
ZDICTLIB_API unsigned ZDICT_getDictID(const void *dictBuffer, size_t dictSize)
Definition: zdict.c:102