compression_coding/ARITH1E.C at master · hyperbolic-c/compression_coding · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
/************************** Start of ARITH1E.C *************************
 *
 * This is the order-1 arithmetic coding module
 * This module implements an order 1 arithmetic
 * compression program that uses escape codes to encode previously
 * unseen symbols.
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include "errhand.h"
#include "bitio.h"

/*
 * The SYMBOL structure is what is used to define a symbol in
 * arithmetic coding terms.  A symbol is defined as a range between
 * 0 and 1.  Since we are using integer math, instead of using 0 and 1
 * as our end points, we have an integer scale.  The low_count and
 * high_count define where the symbol falls in the range.
 */

typedef struct {
    unsigned short int low_count;
    unsigned short int high_count;
    unsigned short int scale;
} SYMBOL;

#define MAXIMUM_SCALE   16383  /* Maximum allowed frequency count */
#define END_OF_STREAM   257    /* The EOF symbol                  */
#define ESCAPE          256    /* The ESCAPE symbol               */

static int *totals[ END_OF_STREAM ];

/*
 * Local function prototypes.
 */


static void initialize_arithmetic_decoder( BIT_FILE *stream );
static void remove_symbol_from_stream( BIT_FILE *stream, SYMBOL *s );
static void initialize_arithmetic_encoder( void );
static void encode_symbol( BIT_FILE *stream, SYMBOL *s );
static void flush_arithmetic_encoder( BIT_FILE *stream );
static short int get_current_count( SYMBOL *s );
static void initialize_model( void );
static void update_model( int symbol, int context );
static int convert_int_to_symbol( int symbol, int context, SYMBOL *s );
static void get_symbol_scale( int context, SYMBOL *s );
static int convert_symbol_to_int( int count, int context, SYMBOL *s );


//char *CompressionName = "Adaptive order 1 model with arithmetic coding";
//char *Usage           = "in-file out-file\n\n";

/*
 * The compression loop for this program is a little more complicated than
 * its counterpart in ARITH1.C. In addition to handling different contexts,
 * this program has to handle the possiblity of an escape code, which
 * means recoding the symbol using a different context table.  In this
 * case, that is the ESCAPE table, which has one count per symbol and is
 * never updated.
 */
int CompressFile_adaptive_order_1_arithmetic(FILE *input, BIT_FILE *output)
{
    SYMBOL s;
    int c;
    int context;
    int escaped;

    DWORD milliseconds = GetTickCount();
    context = 0;
    initialize_model();
    initialize_arithmetic_encoder();
    for ( ; ; ) {
        c = getc( input );
        if ( c == EOF )
            c = END_OF_STREAM;
        escaped = convert_int_to_symbol( c, context, &s );
        encode_symbol( output, &s );
        if ( escaped ) {
            convert_int_to_symbol( c, ESCAPE, &s );
            encode_symbol( output, &s );
        }
        if ( c == END_OF_STREAM )
	    break;
        update_model( c, context );
        context = c;
    }
    flush_arithmetic_encoder( output );
    return (GetTickCount() - milliseconds);
}

/*
 * Just like the compression loop, the expansion routine has to handle
 * an incoming escape character, and switch contexts if one is read.
 * Here that logic has been implemented as a loop so as to avoid
 * writing the same code twice.
 *
 */
int ExpandFile_adaptive_order_1_arithmetic(BIT_FILE *input, FILE *output)
{
    SYMBOL s;
    int count;
    int c;
    int context;
    int last_context;

    DWORD milliseconds = GetTickCount();
    context = 0;
    initialize_model();
    initialize_arithmetic_decoder( input );
    for ( ; ; ) {
        last_context = context;
        do {
            get_symbol_scale( context, &s );
            count = get_current_count( &s );
            c = convert_symbol_to_int( count, context, &s );
            remove_symbol_from_stream( input, &s );
            context = c;
        } while ( c == ESCAPE );
        if ( c == END_OF_STREAM )
            break;
        putc( (char) c, output );
	update_model( c, last_context );
        context = c;
    }
    return (GetTickCount() - milliseconds);
}

/*
 * Since we are using ESCAPE codes in this program, we can initialize
 * all of the contexts to contain nothing more than a single count for
 * the escape code.  However, the default context table, the ESCAPE
 * table, needs to have a single count for every symbol, since it is
 * our context table of final resort.  The remaining tables are assumed
 * to be initialized to zero when they are created using calloc().  The
 * ESCAPE code has its count set to 1 by the call to update_model().
 */

void initialize_model()
{
    int context;
    int i;

    for ( context = 0 ; context < END_OF_STREAM ; context++ ) {
        totals[ context ] = (int *) calloc( END_OF_STREAM + 2, sizeof(int) );
        if ( totals[ context ] == NULL )
            fatal_error( "Failure allocating context %d", context );
        update_model( ESCAPE, context );
    }
    for ( i = 0 ; i <= ( END_OF_STREAM + 1 ) ; i++ )
        totals[ ESCAPE ][ i ] = i;
}

/*
 * This routine is called to increment the counts for the current
 * contexts.  It is called after a character has been encoded or decoded.
 * Just like in the routines from other programs, this code has to check
 * to see if the context table has passed the MAXIMUM_SCALE value.  If
 * it has, the table needs to be rescaled.  The difference here is that
 * we don't care if any of the symbols have their counts reduced to 0,
 * since we can always issue an escape code.  The only symbol that is
 * required to be non-zero is the ESCAPE code, and that is handled
 * by the two lines of code at the end of the routine.
 */

void update_model( symbol, context )
int symbol;
int context;
{
    int i;

    for ( i = symbol + 1 ; i <= ( END_OF_STREAM + 1 ) ; i++ )
        totals[ context ][ i ]++;
    if ( totals[ context ][ END_OF_STREAM + 1 ] < MAXIMUM_SCALE )
        return;
    for ( i = 1 ; i <= ( END_OF_STREAM + 1 ) ; i++ )
        totals[ context ][ i ] /= 2;
    totals[ context ][ END_OF_STREAM ] = totals[ context ][ ESCAPE ] + 1;
    totals[ context ][ END_OF_STREAM + 1 ] =
        totals[ context ][ END_OF_STREAM ] + 1;
}

/*
 * This routine operates much like the convert_int_to_symbol from ARITH1.C
 * and earlier programs.  The difference here is that the symbol is checked
 * to see if it has a count of 0.  If it does, instead of returning
 * the counts for this symbol, we return the counts for the ESCAPE symbol
 * in the current context.  The return value from the function is either 0
 * in case the symbol was valid, or a 1 if the symbol had to be escaped.
 */

int convert_int_to_symbol( c, context, s )
int c;
int context;
SYMBOL *s;
{
    s->scale = totals[ context ][ END_OF_STREAM + 1 ];
    s->low_count = totals[ context ][ c ];
    s->high_count = totals[ context ][ c + 1 ];
    if ( s->low_count < s->high_count )
        return( 0 );
    s->low_count = totals[ context ][ ESCAPE ];
    s->high_count = totals[ context ][ ESCAPE + 1 ];
    return( 1 );
}

/*
 * This routine is unchanged from earlier versions of this program.  It
 * just has to find the cumulative total count for the entire context
 * table.
 */

void get_symbol_scale( context, s )
int context;
SYMBOL *s;
{
    s->scale = totals[ context][ END_OF_STREAM + 1 ];
}

/*
 * This routine has not had to change from earlier versions of this program
 * either.  It just has to look through the table for the correct symbol,
 * and return its value.
 */

int convert_symbol_to_int( count, context, s )
int count;
int context;
SYMBOL *s;
{
    int c;

    for ( c = 0; count >= totals[ context ][ c + 1 ] ; c++ )
        ;
    s->high_count = totals[ context ][ c + 1 ];
    s->low_count = totals[ context ][ c ];
    return( c );
}

/*
 * Everything from here down defines the arithmetic coder section
 * of the program.  This code is unchanged from the arithmetic coding
 * sections of ARITH1.C and earlier programs.
 */

/*
 * These four variables define the current state of the arithmetic
 * coder/decoder.  They are assumed to be 16 bits long.  Note that
 * by declaring them as short ints, they will actually be 16 bits
 * on most 80X86 and 680X0 machines, as well as VAXen.
 */
static unsigned short int code;  /* The present input code value       */
static unsigned short int low;   /* Start of the current code range    */
static unsigned short int high;  /* End of the current code range      */
long underflow_bits;             /* Number of underflow bits pending   */

/*
 * This routine must be called to initialize the encoding process.
 * The high register is initialized to all 1s, and it is assumed that
 * it has an infinite string of 1s to be shifted into the lower bit
 * positions when needed.
 */
void initialize_arithmetic_encoder()
{
    low = 0;
    high = 0xffff;
    underflow_bits = 0;
}

/*
 * At the end of the encoding process, there are still significant
 * bits left in the high and low registers.  We output two bits,
 * plus as many underflow bits as are necessary.
 */
void flush_arithmetic_encoder( stream )
BIT_FILE *stream;
{
    OutputBit( stream, low & 0x4000 );
    underflow_bits++;
    while ( underflow_bits-- > 0 )
        OutputBit( stream, ~low & 0x4000 );
    OutputBits( stream, 0L, 16 );
}

/*
 * This routine is called to encode a symbol.  The symbol is passed
 * in the SYMBOL structure as a low count, a high count, and a range,
 * instead of the more conventional probability ranges.  The encoding
 * process takes two steps.  First, the values of high and low are
 * updated to take into account the range restriction created by the
 * new symbol.  Then, as many bits as possible are shifted out to
 * the output stream.  Finally, high and low are stable again and
 * the routine returns.
 */
void encode_symbol( stream, s )
BIT_FILE *stream;
SYMBOL *s;
{
    long range;

/*
 * These three lines rescale high and low for the new symbol.
 */
    range = (long) ( high-low ) + 1;
    high = low + (unsigned short int)
                 (( range * s->high_count ) / s->scale - 1 );
    low = low + (unsigned short int)
                 (( range * s->low_count ) / s->scale );
/*
 * This loop turns out new bits until high and low are far enough
 * apart to have stabilized.
 */
    for ( ; ; ) {
/*
 * If this test passes, it means that the MSDigits match, and can
 * be sent to the output stream.
 */
        if ( ( high & 0x8000 ) == ( low & 0x8000 ) ) {
            OutputBit( stream, high & 0x8000 );
            while ( underflow_bits > 0 ) {
                OutputBit( stream, ~high & 0x8000 );
                underflow_bits--;
            }
        }
/*
 * If this test passes, the numbers are in danger of underflow, because
 * the MSDigits don't match, and the 2nd digits are just one apart.
 */
        else if ( ( low & 0x4000 ) && !( high & 0x4000 )) {
            underflow_bits += 1;
            low &= 0x3fff;
            high |= 0x4000;
        } else
            return ;
        low <<= 1;
        high <<= 1;
        high |= 1;
    }
}

/*
 * When decoding, this routine is called to figure out which symbol
 * is presently waiting to be decoded.  This routine expects to get
 * the current model scale in the s->scale parameter, and it returns
 * a count that corresponds to the present floating point code:
 *
 *  code = count / s->scale
 */
short int get_current_count( s )
SYMBOL *s;
{
    long range;
    short int count;

    range = (long) ( high - low ) + 1;
    count = (short int)
            ((((long) ( code - low ) + 1 ) * s->scale-1 ) / range );
    return( count );
}

/*
 * This routine is called to initialize the state of the arithmetic
 * decoder.  This involves initializing the high and low registers
 * to their conventional starting values, plus reading the first
 * 16 bits from the input stream into the code value.
 */
void initialize_arithmetic_decoder( stream )
BIT_FILE *stream;
{
    int i;

    code = 0;
    for ( i = 0 ; i < 16 ; i++ ) {
        code <<= 1;
        code += InputBit( stream );
    }
    low = 0;
    high = 0xffff;
}

/*
 * Just figuring out what the present symbol is doesn't remove
 * it from the input bit stream.  After the character has been
 * decoded, this routine has to be called to remove it from the
 * input stream.
 */
void remove_symbol_from_stream( stream, s )
BIT_FILE *stream;
SYMBOL *s;
{
    long range;

/*
 * First, the range is expanded to account for the symbol removal.
 */
    range = (long)( high - low ) + 1;
    high = low + (unsigned short int)
                 (( range * s->high_count ) / s->scale - 1 );
    low = low + (unsigned short int)
                 (( range * s->low_count ) / s->scale );
/*
 * Next, any possible bits are shipped out.
 */
    for ( ; ; ) {
/*
 * If the MSDigits match, the bits will be shifted out.
 */
        if ( ( high & 0x8000 ) == ( low & 0x8000 ) ) {
        }
/*
 * Else, if underflow is threatening, shift out the 2nd MSDigit.
 */
        else if ((low & 0x4000) == 0x4000  && (high & 0x4000) == 0 ) {
            code ^= 0x4000;
            low   &= 0x3fff;
            high  |= 0x4000;
        } else
 /*
 * Otherwise, nothing can be shifted out, so I return.
 */
            return;
        low <<= 1;
        high <<= 1;
        high |= 1;
        code <<= 1;
        code += InputBit( stream );
    }
}

/************************** End of ARITH1E.C ****************************/