Merge tag 'x86-cpu-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
[linux-2.6-microblaze.git] / lib / zstd / fse_compress.c
1 /*
2  * FSE : Finite State Entropy encoder
3  * Copyright (C) 2013-2015, Yann Collet.
4  *
5  * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are
9  * met:
10  *
11  *   * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *   * Redistributions in binary form must reproduce the above
14  * copyright notice, this list of conditions and the following disclaimer
15  * in the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  * This program is free software; you can redistribute it and/or modify it under
31  * the terms of the GNU General Public License version 2 as published by the
32  * Free Software Foundation. This program is dual-licensed; you may select
33  * either version 2 of the GNU General Public License ("GPL") or BSD license
34  * ("BSD").
35  *
36  * You can contact the author at :
37  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
38  */
39
40 /* **************************************************************
41 *  Compiler specifics
42 ****************************************************************/
43 #define FORCE_INLINE static __always_inline
44
45 /* **************************************************************
46 *  Includes
47 ****************************************************************/
48 #include "bitstream.h"
49 #include "fse.h"
50 #include <linux/compiler.h>
51 #include <linux/kernel.h>
52 #include <linux/math64.h>
53 #include <linux/string.h> /* memcpy, memset */
54
55 /* **************************************************************
56 *  Error Management
57 ****************************************************************/
58 #define FSE_STATIC_ASSERT(c)                                   \
59         {                                                      \
60                 enum { FSE_static_assert = 1 / (int)(!!(c)) }; \
61         } /* use only *after* variable declarations */
62
63 /* **************************************************************
64 *  Templates
65 ****************************************************************/
66 /*
67   designed to be included
68   for type-specific functions (template emulation in C)
69   Objective is to write these functions only once, for improved maintenance
70 */
71
72 /* safety checks */
73 #ifndef FSE_FUNCTION_EXTENSION
74 #error "FSE_FUNCTION_EXTENSION must be defined"
75 #endif
76 #ifndef FSE_FUNCTION_TYPE
77 #error "FSE_FUNCTION_TYPE must be defined"
78 #endif
79
80 /* Function names */
81 #define FSE_CAT(X, Y) X##Y
82 #define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
83 #define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)
84
85 /* Function templates */
86
87 /* FSE_buildCTable_wksp() :
88  * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
89  * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
90  * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
91  */
92 size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
93 {
94         U32 const tableSize = 1 << tableLog;
95         U32 const tableMask = tableSize - 1;
96         void *const ptr = ct;
97         U16 *const tableU16 = ((U16 *)ptr) + 2;
98         void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableLog ? tableSize >> 1 : 1);
99         FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT);
100         U32 const step = FSE_TABLESTEP(tableSize);
101         U32 highThreshold = tableSize - 1;
102
103         U32 *cumul;
104         FSE_FUNCTION_TYPE *tableSymbol;
105         size_t spaceUsed32 = 0;
106
107         cumul = (U32 *)workspace + spaceUsed32;
108         spaceUsed32 += FSE_MAX_SYMBOL_VALUE + 2;
109         tableSymbol = (FSE_FUNCTION_TYPE *)((U32 *)workspace + spaceUsed32);
110         spaceUsed32 += ALIGN(sizeof(FSE_FUNCTION_TYPE) * ((size_t)1 << tableLog), sizeof(U32)) >> 2;
111
112         if ((spaceUsed32 << 2) > workspaceSize)
113                 return ERROR(tableLog_tooLarge);
114         workspace = (U32 *)workspace + spaceUsed32;
115         workspaceSize -= (spaceUsed32 << 2);
116
117         /* CTable header */
118         tableU16[-2] = (U16)tableLog;
119         tableU16[-1] = (U16)maxSymbolValue;
120
121         /* For explanations on how to distribute symbol values over the table :
122         *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
123
124         /* symbol start positions */
125         {
126                 U32 u;
127                 cumul[0] = 0;
128                 for (u = 1; u <= maxSymbolValue + 1; u++) {
129                         if (normalizedCounter[u - 1] == -1) { /* Low proba symbol */
130                                 cumul[u] = cumul[u - 1] + 1;
131                                 tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u - 1);
132                         } else {
133                                 cumul[u] = cumul[u - 1] + normalizedCounter[u - 1];
134                         }
135                 }
136                 cumul[maxSymbolValue + 1] = tableSize + 1;
137         }
138
139         /* Spread symbols */
140         {
141                 U32 position = 0;
142                 U32 symbol;
143                 for (symbol = 0; symbol <= maxSymbolValue; symbol++) {
144                         int nbOccurences;
145                         for (nbOccurences = 0; nbOccurences < normalizedCounter[symbol]; nbOccurences++) {
146                                 tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
147                                 position = (position + step) & tableMask;
148                                 while (position > highThreshold)
149                                         position = (position + step) & tableMask; /* Low proba area */
150                         }
151                 }
152
153                 if (position != 0)
154                         return ERROR(GENERIC); /* Must have gone through all positions */
155         }
156
157         /* Build table */
158         {
159                 U32 u;
160                 for (u = 0; u < tableSize; u++) {
161                         FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : static analyzer may not understand tableSymbol is properly initialized */
162                         tableU16[cumul[s]++] = (U16)(tableSize + u); /* TableU16 : sorted by symbol order; gives next state value */
163                 }
164         }
165
166         /* Build Symbol Transformation Table */
167         {
168                 unsigned total = 0;
169                 unsigned s;
170                 for (s = 0; s <= maxSymbolValue; s++) {
171                         switch (normalizedCounter[s]) {
172                         case 0: break;
173
174                         case -1:
175                         case 1:
176                                 symbolTT[s].deltaNbBits = (tableLog << 16) - (1 << tableLog);
177                                 symbolTT[s].deltaFindState = total - 1;
178                                 total++;
179                                 break;
180                         default: {
181                                 U32 const maxBitsOut = tableLog - BIT_highbit32(normalizedCounter[s] - 1);
182                                 U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
183                                 symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
184                                 symbolTT[s].deltaFindState = total - normalizedCounter[s];
185                                 total += normalizedCounter[s];
186                         }
187                         }
188                 }
189         }
190
191         return 0;
192 }
193
194 /*-**************************************************************
195 *  FSE NCount encoding-decoding
196 ****************************************************************/
197 size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
198 {
199         size_t const maxHeaderSize = (((maxSymbolValue + 1) * tableLog) >> 3) + 3;
200         return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */
201 }
202
203 static size_t FSE_writeNCount_generic(void *header, size_t headerBufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
204                                       unsigned writeIsSafe)
205 {
206         BYTE *const ostart = (BYTE *)header;
207         BYTE *out = ostart;
208         BYTE *const oend = ostart + headerBufferSize;
209         int nbBits;
210         const int tableSize = 1 << tableLog;
211         int remaining;
212         int threshold;
213         U32 bitStream;
214         int bitCount;
215         unsigned charnum = 0;
216         int previous0 = 0;
217
218         bitStream = 0;
219         bitCount = 0;
220         /* Table Size */
221         bitStream += (tableLog - FSE_MIN_TABLELOG) << bitCount;
222         bitCount += 4;
223
224         /* Init */
225         remaining = tableSize + 1; /* +1 for extra accuracy */
226         threshold = tableSize;
227         nbBits = tableLog + 1;
228
229         while (remaining > 1) { /* stops at 1 */
230                 if (previous0) {
231                         unsigned start = charnum;
232                         while (!normalizedCounter[charnum])
233                                 charnum++;
234                         while (charnum >= start + 24) {
235                                 start += 24;
236                                 bitStream += 0xFFFFU << bitCount;
237                                 if ((!writeIsSafe) && (out > oend - 2))
238                                         return ERROR(dstSize_tooSmall); /* Buffer overflow */
239                                 out[0] = (BYTE)bitStream;
240                                 out[1] = (BYTE)(bitStream >> 8);
241                                 out += 2;
242                                 bitStream >>= 16;
243                         }
244                         while (charnum >= start + 3) {
245                                 start += 3;
246                                 bitStream += 3 << bitCount;
247                                 bitCount += 2;
248                         }
249                         bitStream += (charnum - start) << bitCount;
250                         bitCount += 2;
251                         if (bitCount > 16) {
252                                 if ((!writeIsSafe) && (out > oend - 2))
253                                         return ERROR(dstSize_tooSmall); /* Buffer overflow */
254                                 out[0] = (BYTE)bitStream;
255                                 out[1] = (BYTE)(bitStream >> 8);
256                                 out += 2;
257                                 bitStream >>= 16;
258                                 bitCount -= 16;
259                         }
260                 }
261                 {
262                         int count = normalizedCounter[charnum++];
263                         int const max = (2 * threshold - 1) - remaining;
264                         remaining -= count < 0 ? -count : count;
265                         count++; /* +1 for extra accuracy */
266                         if (count >= threshold)
267                                 count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
268                         bitStream += count << bitCount;
269                         bitCount += nbBits;
270                         bitCount -= (count < max);
271                         previous0 = (count == 1);
272                         if (remaining < 1)
273                                 return ERROR(GENERIC);
274                         while (remaining < threshold)
275                                 nbBits--, threshold >>= 1;
276                 }
277                 if (bitCount > 16) {
278                         if ((!writeIsSafe) && (out > oend - 2))
279                                 return ERROR(dstSize_tooSmall); /* Buffer overflow */
280                         out[0] = (BYTE)bitStream;
281                         out[1] = (BYTE)(bitStream >> 8);
282                         out += 2;
283                         bitStream >>= 16;
284                         bitCount -= 16;
285                 }
286         }
287
288         /* flush remaining bitStream */
289         if ((!writeIsSafe) && (out > oend - 2))
290                 return ERROR(dstSize_tooSmall); /* Buffer overflow */
291         out[0] = (BYTE)bitStream;
292         out[1] = (BYTE)(bitStream >> 8);
293         out += (bitCount + 7) / 8;
294
295         if (charnum > maxSymbolValue + 1)
296                 return ERROR(GENERIC);
297
298         return (out - ostart);
299 }
300
301 size_t FSE_writeNCount(void *buffer, size_t bufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
302 {
303         if (tableLog > FSE_MAX_TABLELOG)
304                 return ERROR(tableLog_tooLarge); /* Unsupported */
305         if (tableLog < FSE_MIN_TABLELOG)
306                 return ERROR(GENERIC); /* Unsupported */
307
308         if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
309                 return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
310
311         return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
312 }
313
314 /*-**************************************************************
315 *  Counting histogram
316 ****************************************************************/
317 /*! FSE_count_simple
318         This function counts byte values within `src`, and store the histogram into table `count`.
319         It doesn't use any additional memory.
320         But this function is unsafe : it doesn't check that all values within `src` can fit into `count`.
321         For this reason, prefer using a table `count` with 256 elements.
322         @return : count of most numerous element
323 */
324 size_t FSE_count_simple(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize)
325 {
326         const BYTE *ip = (const BYTE *)src;
327         const BYTE *const end = ip + srcSize;
328         unsigned maxSymbolValue = *maxSymbolValuePtr;
329         unsigned max = 0;
330
331         memset(count, 0, (maxSymbolValue + 1) * sizeof(*count));
332         if (srcSize == 0) {
333                 *maxSymbolValuePtr = 0;
334                 return 0;
335         }
336
337         while (ip < end)
338                 count[*ip++]++;
339
340         while (!count[maxSymbolValue])
341                 maxSymbolValue--;
342         *maxSymbolValuePtr = maxSymbolValue;
343
344         {
345                 U32 s;
346                 for (s = 0; s <= maxSymbolValue; s++)
347                         if (count[s] > max)
348                                 max = count[s];
349         }
350
351         return (size_t)max;
352 }
353
354 /* FSE_count_parallel_wksp() :
355  * Same as FSE_count_parallel(), but using an externally provided scratch buffer.
356  * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`` */
357 static size_t FSE_count_parallel_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned checkMax,
358                                       unsigned *const workSpace)
359 {
360         const BYTE *ip = (const BYTE *)source;
361         const BYTE *const iend = ip + sourceSize;
362         unsigned maxSymbolValue = *maxSymbolValuePtr;
363         unsigned max = 0;
364         U32 *const Counting1 = workSpace;
365         U32 *const Counting2 = Counting1 + 256;
366         U32 *const Counting3 = Counting2 + 256;
367         U32 *const Counting4 = Counting3 + 256;
368
369         memset(Counting1, 0, 4 * 256 * sizeof(unsigned));
370
371         /* safety checks */
372         if (!sourceSize) {
373                 memset(count, 0, maxSymbolValue + 1);
374                 *maxSymbolValuePtr = 0;
375                 return 0;
376         }
377         if (!maxSymbolValue)
378                 maxSymbolValue = 255; /* 0 == default */
379
380         /* by stripes of 16 bytes */
381         {
382                 U32 cached = ZSTD_read32(ip);
383                 ip += 4;
384                 while (ip < iend - 15) {
385                         U32 c = cached;
386                         cached = ZSTD_read32(ip);
387                         ip += 4;
388                         Counting1[(BYTE)c]++;
389                         Counting2[(BYTE)(c >> 8)]++;
390                         Counting3[(BYTE)(c >> 16)]++;
391                         Counting4[c >> 24]++;
392                         c = cached;
393                         cached = ZSTD_read32(ip);
394                         ip += 4;
395                         Counting1[(BYTE)c]++;
396                         Counting2[(BYTE)(c >> 8)]++;
397                         Counting3[(BYTE)(c >> 16)]++;
398                         Counting4[c >> 24]++;
399                         c = cached;
400                         cached = ZSTD_read32(ip);
401                         ip += 4;
402                         Counting1[(BYTE)c]++;
403                         Counting2[(BYTE)(c >> 8)]++;
404                         Counting3[(BYTE)(c >> 16)]++;
405                         Counting4[c >> 24]++;
406                         c = cached;
407                         cached = ZSTD_read32(ip);
408                         ip += 4;
409                         Counting1[(BYTE)c]++;
410                         Counting2[(BYTE)(c >> 8)]++;
411                         Counting3[(BYTE)(c >> 16)]++;
412                         Counting4[c >> 24]++;
413                 }
414                 ip -= 4;
415         }
416
417         /* finish last symbols */
418         while (ip < iend)
419                 Counting1[*ip++]++;
420
421         if (checkMax) { /* verify stats will fit into destination table */
422                 U32 s;
423                 for (s = 255; s > maxSymbolValue; s--) {
424                         Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
425                         if (Counting1[s])
426                                 return ERROR(maxSymbolValue_tooSmall);
427                 }
428         }
429
430         {
431                 U32 s;
432                 for (s = 0; s <= maxSymbolValue; s++) {
433                         count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
434                         if (count[s] > max)
435                                 max = count[s];
436                 }
437         }
438
439         while (!count[maxSymbolValue])
440                 maxSymbolValue--;
441         *maxSymbolValuePtr = maxSymbolValue;
442         return (size_t)max;
443 }
444
445 /* FSE_countFast_wksp() :
446  * Same as FSE_countFast(), but using an externally provided scratch buffer.
447  * `workSpace` size must be table of >= `1024` unsigned */
448 size_t FSE_countFast_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace)
449 {
450         if (sourceSize < 1500)
451                 return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);
452         return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
453 }
454
455 /* FSE_count_wksp() :
456  * Same as FSE_count(), but using an externally provided scratch buffer.
457  * `workSpace` size must be table of >= `1024` unsigned */
458 size_t FSE_count_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace)
459 {
460         if (*maxSymbolValuePtr < 255)
461                 return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
462         *maxSymbolValuePtr = 255;
463         return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
464 }
465
466 /*-**************************************************************
467 *  FSE Compression Code
468 ****************************************************************/
469 /*! FSE_sizeof_CTable() :
470         FSE_CTable is a variable size structure which contains :
471         `U16 tableLog;`
472         `U16 maxSymbolValue;`
473         `U16 nextStateNumber[1 << tableLog];`                         // This size is variable
474         `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];`  // This size is variable
475 Allocation is manual (C standard does not support variable-size structures).
476 */
477 size_t FSE_sizeof_CTable(unsigned maxSymbolValue, unsigned tableLog)
478 {
479         if (tableLog > FSE_MAX_TABLELOG)
480                 return ERROR(tableLog_tooLarge);
481         return FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue) * sizeof(U32);
482 }
483
484 /* provides the minimum logSize to safely represent a distribution */
485 static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
486 {
487         U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
488         U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
489         U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
490         return minBits;
491 }
492
493 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
494 {
495         U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
496         U32 tableLog = maxTableLog;
497         U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
498         if (tableLog == 0)
499                 tableLog = FSE_DEFAULT_TABLELOG;
500         if (maxBitsSrc < tableLog)
501                 tableLog = maxBitsSrc; /* Accuracy can be reduced */
502         if (minBits > tableLog)
503                 tableLog = minBits; /* Need a minimum to safely represent all symbol values */
504         if (tableLog < FSE_MIN_TABLELOG)
505                 tableLog = FSE_MIN_TABLELOG;
506         if (tableLog > FSE_MAX_TABLELOG)
507                 tableLog = FSE_MAX_TABLELOG;
508         return tableLog;
509 }
510
511 unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
512 {
513         return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
514 }
515
516 /* Secondary normalization method.
517    To be used when primary method fails. */
518
519 static size_t FSE_normalizeM2(short *norm, U32 tableLog, const unsigned *count, size_t total, U32 maxSymbolValue)
520 {
521         short const NOT_YET_ASSIGNED = -2;
522         U32 s;
523         U32 distributed = 0;
524         U32 ToDistribute;
525
526         /* Init */
527         U32 const lowThreshold = (U32)(total >> tableLog);
528         U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
529
530         for (s = 0; s <= maxSymbolValue; s++) {
531                 if (count[s] == 0) {
532                         norm[s] = 0;
533                         continue;
534                 }
535                 if (count[s] <= lowThreshold) {
536                         norm[s] = -1;
537                         distributed++;
538                         total -= count[s];
539                         continue;
540                 }
541                 if (count[s] <= lowOne) {
542                         norm[s] = 1;
543                         distributed++;
544                         total -= count[s];
545                         continue;
546                 }
547
548                 norm[s] = NOT_YET_ASSIGNED;
549         }
550         ToDistribute = (1 << tableLog) - distributed;
551
552         if ((total / ToDistribute) > lowOne) {
553                 /* risk of rounding to zero */
554                 lowOne = (U32)((total * 3) / (ToDistribute * 2));
555                 for (s = 0; s <= maxSymbolValue; s++) {
556                         if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
557                                 norm[s] = 1;
558                                 distributed++;
559                                 total -= count[s];
560                                 continue;
561                         }
562                 }
563                 ToDistribute = (1 << tableLog) - distributed;
564         }
565
566         if (distributed == maxSymbolValue + 1) {
567                 /* all values are pretty poor;
568                    probably incompressible data (should have already been detected);
569                    find max, then give all remaining points to max */
570                 U32 maxV = 0, maxC = 0;
571                 for (s = 0; s <= maxSymbolValue; s++)
572                         if (count[s] > maxC)
573                                 maxV = s, maxC = count[s];
574                 norm[maxV] += (short)ToDistribute;
575                 return 0;
576         }
577
578         if (total == 0) {
579                 /* all of the symbols were low enough for the lowOne or lowThreshold */
580                 for (s = 0; ToDistribute > 0; s = (s + 1) % (maxSymbolValue + 1))
581                         if (norm[s] > 0)
582                                 ToDistribute--, norm[s]++;
583                 return 0;
584         }
585
586         {
587                 U64 const vStepLog = 62 - tableLog;
588                 U64 const mid = (1ULL << (vStepLog - 1)) - 1;
589                 U64 const rStep = div_u64((((U64)1 << vStepLog) * ToDistribute) + mid, (U32)total); /* scale on remaining */
590                 U64 tmpTotal = mid;
591                 for (s = 0; s <= maxSymbolValue; s++) {
592                         if (norm[s] == NOT_YET_ASSIGNED) {
593                                 U64 const end = tmpTotal + (count[s] * rStep);
594                                 U32 const sStart = (U32)(tmpTotal >> vStepLog);
595                                 U32 const sEnd = (U32)(end >> vStepLog);
596                                 U32 const weight = sEnd - sStart;
597                                 if (weight < 1)
598                                         return ERROR(GENERIC);
599                                 norm[s] = (short)weight;
600                                 tmpTotal = end;
601                         }
602                 }
603         }
604
605         return 0;
606 }
607
608 size_t FSE_normalizeCount(short *normalizedCounter, unsigned tableLog, const unsigned *count, size_t total, unsigned maxSymbolValue)
609 {
610         /* Sanity checks */
611         if (tableLog == 0)
612                 tableLog = FSE_DEFAULT_TABLELOG;
613         if (tableLog < FSE_MIN_TABLELOG)
614                 return ERROR(GENERIC); /* Unsupported size */
615         if (tableLog > FSE_MAX_TABLELOG)
616                 return ERROR(tableLog_tooLarge); /* Unsupported size */
617         if (tableLog < FSE_minTableLog(total, maxSymbolValue))
618                 return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */
619
620         {
621                 U32 const rtbTable[] = {0, 473195, 504333, 520860, 550000, 700000, 750000, 830000};
622                 U64 const scale = 62 - tableLog;
623                 U64 const step = div_u64((U64)1 << 62, (U32)total); /* <== here, one division ! */
624                 U64 const vStep = 1ULL << (scale - 20);
625                 int stillToDistribute = 1 << tableLog;
626                 unsigned s;
627                 unsigned largest = 0;
628                 short largestP = 0;
629                 U32 lowThreshold = (U32)(total >> tableLog);
630
631                 for (s = 0; s <= maxSymbolValue; s++) {
632                         if (count[s] == total)
633                                 return 0; /* rle special case */
634                         if (count[s] == 0) {
635                                 normalizedCounter[s] = 0;
636                                 continue;
637                         }
638                         if (count[s] <= lowThreshold) {
639                                 normalizedCounter[s] = -1;
640                                 stillToDistribute--;
641                         } else {
642                                 short proba = (short)((count[s] * step) >> scale);
643                                 if (proba < 8) {
644                                         U64 restToBeat = vStep * rtbTable[proba];
645                                         proba += (count[s] * step) - ((U64)proba << scale) > restToBeat;
646                                 }
647                                 if (proba > largestP)
648                                         largestP = proba, largest = s;
649                                 normalizedCounter[s] = proba;
650                                 stillToDistribute -= proba;
651                         }
652                 }
653                 if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
654                         /* corner case, need another normalization method */
655                         size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
656                         if (FSE_isError(errorCode))
657                                 return errorCode;
658                 } else
659                         normalizedCounter[largest] += (short)stillToDistribute;
660         }
661
662         return tableLog;
663 }
664
665 /* fake FSE_CTable, for raw (uncompressed) input */
666 size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits)
667 {
668         const unsigned tableSize = 1 << nbBits;
669         const unsigned tableMask = tableSize - 1;
670         const unsigned maxSymbolValue = tableMask;
671         void *const ptr = ct;
672         U16 *const tableU16 = ((U16 *)ptr) + 2;
673         void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableSize >> 1); /* assumption : tableLog >= 1 */
674         FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT);
675         unsigned s;
676
677         /* Sanity checks */
678         if (nbBits < 1)
679                 return ERROR(GENERIC); /* min size */
680
681         /* header */
682         tableU16[-2] = (U16)nbBits;
683         tableU16[-1] = (U16)maxSymbolValue;
684
685         /* Build table */
686         for (s = 0; s < tableSize; s++)
687                 tableU16[s] = (U16)(tableSize + s);
688
689         /* Build Symbol Transformation Table */
690         {
691                 const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
692                 for (s = 0; s <= maxSymbolValue; s++) {
693                         symbolTT[s].deltaNbBits = deltaNbBits;
694                         symbolTT[s].deltaFindState = s - 1;
695                 }
696         }
697
698         return 0;
699 }
700
701 /* fake FSE_CTable, for rle input (always same symbol) */
702 size_t FSE_buildCTable_rle(FSE_CTable *ct, BYTE symbolValue)
703 {
704         void *ptr = ct;
705         U16 *tableU16 = ((U16 *)ptr) + 2;
706         void *FSCTptr = (U32 *)ptr + 2;
707         FSE_symbolCompressionTransform *symbolTT = (FSE_symbolCompressionTransform *)FSCTptr;
708
709         /* header */
710         tableU16[-2] = (U16)0;
711         tableU16[-1] = (U16)symbolValue;
712
713         /* Build table */
714         tableU16[0] = 0;
715         tableU16[1] = 0; /* just in case */
716
717         /* Build Symbol Transformation Table */
718         symbolTT[symbolValue].deltaNbBits = 0;
719         symbolTT[symbolValue].deltaFindState = 0;
720
721         return 0;
722 }
723
724 static size_t FSE_compress_usingCTable_generic(void *dst, size_t dstSize, const void *src, size_t srcSize, const FSE_CTable *ct, const unsigned fast)
725 {
726         const BYTE *const istart = (const BYTE *)src;
727         const BYTE *const iend = istart + srcSize;
728         const BYTE *ip = iend;
729
730         BIT_CStream_t bitC;
731         FSE_CState_t CState1, CState2;
732
733         /* init */
734         if (srcSize <= 2)
735                 return 0;
736         {
737                 size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
738                 if (FSE_isError(initError))
739                         return 0; /* not enough space available to write a bitstream */
740         }
741
742 #define FSE_FLUSHBITS(s) (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
743
744         if (srcSize & 1) {
745                 FSE_initCState2(&CState1, ct, *--ip);
746                 FSE_initCState2(&CState2, ct, *--ip);
747                 FSE_encodeSymbol(&bitC, &CState1, *--ip);
748                 FSE_FLUSHBITS(&bitC);
749         } else {
750                 FSE_initCState2(&CState2, ct, *--ip);
751                 FSE_initCState2(&CState1, ct, *--ip);
752         }
753
754         /* join to mod 4 */
755         srcSize -= 2;
756         if ((sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) && (srcSize & 2)) { /* test bit 2 */
757                 FSE_encodeSymbol(&bitC, &CState2, *--ip);
758                 FSE_encodeSymbol(&bitC, &CState1, *--ip);
759                 FSE_FLUSHBITS(&bitC);
760         }
761
762         /* 2 or 4 encoding per loop */
763         while (ip > istart) {
764
765                 FSE_encodeSymbol(&bitC, &CState2, *--ip);
766
767                 if (sizeof(bitC.bitContainer) * 8 < FSE_MAX_TABLELOG * 2 + 7) /* this test must be static */
768                         FSE_FLUSHBITS(&bitC);
769
770                 FSE_encodeSymbol(&bitC, &CState1, *--ip);
771
772                 if (sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) { /* this test must be static */
773                         FSE_encodeSymbol(&bitC, &CState2, *--ip);
774                         FSE_encodeSymbol(&bitC, &CState1, *--ip);
775                 }
776
777                 FSE_FLUSHBITS(&bitC);
778         }
779
780         FSE_flushCState(&bitC, &CState2);
781         FSE_flushCState(&bitC, &CState1);
782         return BIT_closeCStream(&bitC);
783 }
784
785 size_t FSE_compress_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const FSE_CTable *ct)
786 {
787         unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
788
789         if (fast)
790                 return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
791         else
792                 return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
793 }
794
795 size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }