By JS1


2015-07-29 18:46:50 8 Comments

Recently, I reviewed this question about an LZW decoder. In order to properly review that question, I needed to write my own encoder and decoder to be able to test their program. Someone thought it would be interesting to submit these programs for review, so here they are:

encode.c

/*
 * LZW encoder
 *
 * - Uses fixed length 12-bit encodings.
 * - Outputs in MSB format.
 * - When encoding table fills up, then table is reset back to the initial
 *   256 entries.
 * - Written in C89 style.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

static void encode(FILE *in, FILE *out);

int main(int argc, char *argv[])
{
    FILE *in  = stdin;
    FILE *out = stdout;

    if (argc > 1) {
        in = fopen(argv[1], "rb");
        if (in == NULL) {
            fprintf(stderr, "Can't open %s.\n", argv[1]);
            return 1;
        }
    }
    if (argc > 2) {
        out = fopen(argv[2], "wb");
        if (out == NULL) {
            fprintf(stderr, "Can't open %s.\n", argv[2]);
            return 1;
        }
    }

    encode(in, out);

    if (argc > 1)
        fclose(in);
    if (argc > 2)
        fclose(out);

    return 0;
}

/* The LZW encoder builds a trie out of the input file, but only adds one
 * new trie node per sequence that it outputs.  There will be a maximum
 * of DICT_MAX sequences, so the child trie pointers can be uint16_t values
 * which are the node indices of the child nodes.  An index of 0 is like
 * a NULL pointer, because no node can point to node 0. */
typedef struct DictNode {
    uint16_t child[256];
} DictNode;

#define        DICT_BITS        12
#define        DICT_MAX        (1 << DICT_BITS)

/**
 * LZW encoder.  Reads from file "in" and outputs to file "out".
 */
static void encode(FILE *in, FILE *out)
{
    DictNode   *dictionary   = NULL;
    int         dictSize     = 256;
    int         nextByte     = fgetc(in);
    uint16_t    curNode      = nextByte;
    int         leftoverBits = 0;
    int         leftoverByte = 0;

    // Abort on empty input file.
    if (nextByte == EOF)
        return;

    // Initialize the dictionary.
    dictionary = calloc(DICT_MAX, sizeof(DictNode));
    if (dictionary == NULL)
        return;

    do {
        int curByte = fgetc(in);

        // Check if the file ended.  If so, output the last code and any
        // leftover bits, and then break out of the main loop.
        if (curByte == EOF) {
            if (leftoverBits == 0) {
                fputc(curNode >> 4, out);
                fputc(curNode << 4, out);
            } else {
                fputc(leftoverByte | (curNode >> 8), out);
                fputc(curNode, out);
            }
            break;
        }

        // Follow the new byte down the trie.
        uint16_t nextNode = dictionary[curNode].child[curByte];
        if (nextNode != 0) {
            // The sequence exists, keep searching down the trie.
            curNode = nextNode;
            continue;
        }

        // The sequence doesn't exist.  First, output the code for curNode.
        // This is hardcoded for 12-bit output codes.
        if (leftoverBits == 0) {
            fputc(curNode >> 4, out);
            leftoverBits = 4;
            leftoverByte = (curNode << 4);
        } else {
            fputc(leftoverByte | (curNode >> 8), out);
            fputc(curNode, out);
            leftoverBits = 0;
        }

        // Now, extend the sequence in the trie by the new byte.
        if (dictSize < DICT_MAX) {
            dictionary[curNode].child[curByte] = dictSize++;
        } else {
            // The trie hit max size.  Instead of extending the trie,
            // clear it back to the original 256 entries.
            memset(dictionary, 0, DICT_MAX * sizeof(dictionary[0]));
            dictSize = 256;
        }

        // Start over a new sequence with the current byte.
        curNode = curByte;
    } while (1);

    free(dictionary);
}

decode.c

/*
 * LZW decoder
 *
 * - Uses fixed length 12-bit encodings.
 * - Expects input in MSB format.
 * - When encoding table fills up, then table is reset back to the initial
 *   256 entries.
 * - Written in C89 style.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

static void decode(FILE *in, FILE *out);

int main(int argc, char *argv[])
{
    FILE *in  = stdin;
    FILE *out = stdout;

    if (argc > 1) {
        in = fopen(argv[1], "rb");
        if (in == NULL) {
            fprintf(stderr, "Can't open %s.\n", argv[1]);
            return 1;
        }
    }
    if (argc > 2) {
        out = fopen(argv[2], "wb");
        if (out == NULL) {
            fprintf(stderr, "Can't open %s.\n", argv[2]);
            return 1;
        }
    }

    decode(in, out);

    if (argc > 1)
        fclose(in);
    if (argc > 2)
        fclose(out);

    return 0;
}

/* Each dictionary entry is a byte sequence and a length. */
typedef struct DictEntry {
    uint8_t *seq;
    int      len;
} DictEntry;

#define        DICT_BITS        12
#define        DICT_MAX        (1 << DICT_BITS)

/* We use a custom allocator because the maximum length of all sequences is
 * predictable, and we can keep all the sequences packed together by using
 * one big allocation and carving it up. */
typedef struct AllocInfo {
    uint8_t *base;
    int      len;
    uint8_t *nextAlloc;
} AllocInfo;

static void AllocInit(AllocInfo *alloc, int size);
static uint8_t *Allocate(AllocInfo *alloc, int len);

/* Use a struct to hold input state so we can read 12-bit codes from the
 * input file. */
typedef struct InputState {
    FILE   *fp;
    int     leftoverBits;
    int     leftoverCode;
} InputState;

static int ReadNextCode(InputState *inState);

#define END_OF_FILE_CODE                0xffff

static void decode(FILE *in, FILE *out)
{
    DictEntry  *dict      = NULL;
    AllocInfo   allocInfo;
    int         dictSize  = 256;
    InputState  inState   = { in, 0, 0 };
    uint16_t    prevCode  = ReadNextCode(&inState);
    uint8_t    *mark      = NULL;
    int         i         = 0;

    // Abort on empty input file.
    if (prevCode == END_OF_FILE_CODE)
        return;

    // The maximum of all sequences will be if the sequences increase in length
    // steadily from 1..DICT_MAX.  Add in an extra 2 bytes per entry to account
    // for the fact that we round each allocation to 4 bytes in size.
    AllocInit(&allocInfo, DICT_MAX*DICT_MAX/2 + DICT_MAX*2);

    // Initialize dictionary to single character entries.
    dict = calloc(DICT_MAX, sizeof(DictEntry));
    for (i = 0; i < dictSize; i++) {
        dict[i].seq = Allocate(&allocInfo, 1);
        dict[i].seq[0] = i;
        dict[i].len = 1;
    }
    // This mark is used to indicate where we should reset the allocations
    // to when we reset the dictionary to 256 entries.
    mark = allocInfo.nextAlloc;

    // Output the first code sequence, which is always a single byte.
    fputc(prevCode, out);

    do {
        uint16_t code = ReadNextCode(&inState);

        if (code > dictSize) {
            // The normal case would be that the file ended.
            if (code == END_OF_FILE_CODE)
                break;
            // Otherwise there was a problem with the input file.
            fprintf(stderr, "Error: bad code %d, dictSize = %d.\n", code,
                    dictSize);
            exit(1);
        }

        // Add entry to dictionary first.  That way, if we need to use
        // the just added dictionary entry, it will be ready to use.
        if (dictSize == DICT_MAX) {
            // Dictionary hit max size.  Reset it.
            dictSize            = 256;
            allocInfo.nextAlloc = mark;
        } else {
            // Extend dictionary by one entry.  The new entry is the same
            // as the previous entry plus one character.
            int prevLen        = dict[prevCode].len;
            dict[dictSize].len = prevLen + 1;
            dict[dictSize].seq = Allocate(&allocInfo, prevLen + 1);
            memcpy(dict[dictSize].seq, dict[prevCode].seq, prevLen);
            // The last character normally comes from the first character
            // of the current code.  However, if it is the newly added entry,
            // then it is the first character of the previous code.
            if (code == dictSize)
                dict[dictSize++].seq[prevLen] = dict[prevCode].seq[0];
            else
                dict[dictSize++].seq[prevLen] = dict[code].seq[0];
        }

        // Output code sequence to file.
        fwrite(dict[code].seq, 1, dict[code].len, out);
        prevCode = code;
    } while (1);

    free(dict);
    free(allocInfo.base);
}

/**
 * Intializes the custom allocator.
 */
static void AllocInit(AllocInfo *alloc, int size)
{
    alloc->base      = malloc(size);
    alloc->len       = size;
    alloc->nextAlloc = alloc->base;
}

/**
 * Allocate memory using custom allocator.
 */
static uint8_t *Allocate(AllocInfo *alloc, int len)
{
    uint8_t *ret = alloc->nextAlloc;

    // Round up to the nearest 4 byte alignment.
    len = (len + 3) & ~3;
    alloc->nextAlloc += len;
    return ret;
}

/**
 * Reads a 12 bit code from the file in a MSB manner.
 */
static int ReadNextCode(InputState *inState)
{
    int code;
    int b0 = fgetc(inState->fp);

    if (b0 == EOF)
        return END_OF_FILE_CODE;

    if (inState->leftoverBits == 0) {
        int b1 = fgetc(inState->fp);

        if (b1 == EOF)
            return END_OF_FILE_CODE;
        code = (b0 << 4) | (b1 >> 4);
        inState->leftoverBits = 4;
        inState->leftoverCode = (b1 & 0xf) << 8;
    } else {
        code = inState->leftoverCode | b0;
        inState->leftoverBits = 0;
    }
    return code;
}

Commentary and disclaimers

It may help to read about LZW compression first.

The encoding process essentially uses a trie, and the index number of the trie node is used as the 12-bit code to be encoded. The first 256 trie nodes are implicitly assumed to be the starting single character nodes.

For the decoding process, I decided to use my own allocator just because I thought it would be nice if I packed all the strings in the dictionary together. I could easily have just used malloc with the same results.

In both encoder and decoder, I decided to use fgetc and fputc and depend on the buffering behavior of the FILE functions instead of using my own buffers. It makes the code simpler, although there are probably ways to speed things up by buffering more of the input file.

I was considering making the program more general by being able to output different bit encodings. That's why you see things like DICT_BITS in the code. In the end, though, I wound up hardcoding everything to 12-bit encodings.

I prefer code to be in "reading order", which means that main should be on top, and then the functions that it calls below it. This may be opposite of what most people are used to, but this is an intentional thing.

2 comments

@Emanuele Paolini 2015-07-29 22:13:53

The code is very well written in my opinion. I have only very marginal questions:

  1. in the main function you have a small repetition when checking argv to determine if the FILE should be closed or not. I would prefer checking (in != stdin) which is telling the real reason for the check. Alternatively I would simply neglect closing the files... since you are in the main function, at the end all file descriptors are guaranteed to be closed anyway.

  2. i would prefer for(;;){...} instead of do {...} while(1);. I see three reasons for this: it is shorter, you don't introduce the arbitrary constant 1, it is clear from the beginning that the loop has no termination condition.

  3. i would expect less invasive error checking. Since your encode and decode functions are perfect candidates to be reused in a library, you should not write errors to stderr and should not terminate the execution with exit. A better approach would be to use return codes as signals for errors and let the main function write the messages and terminate.

  4. i would always enclose with braces the blocks of an if statement even when there is a single line. This is to avoid the risk to forget the braces when a line is later added. As an alternative I would put the single line on the same line as the if statement.

@JS1 2015-07-29 23:17:15

Thanks! 1) I agree. 2) I think I have a do loop because it used to have a terminating condition and later I removed it. For infinite loops I prefer to use while (1) over for (;;) though. 3) That error message was there for debugging - good point on that. 4) I prefer the standard K&R brace style where single line if statements have no braces. I know there are reasons not to, but I can't bring myself to do it at this point.

@glampert 2015-07-30 01:51:39

@JS1, to avoid the nuisance of typing the { } for every statement, you might consider adding a plugin to your editor. For instance, on Vim, I use this plugin, so I only type if<tab> and it finished the rest for me, adding the ( ) and { }. :)

@Anders 2015-07-30 06:51:32

general impression

overall the code looks clean and tidy so it is quite easy to follow

some comments/suggestions

MISRA and other coding practices recommend to always use compound statements after conditionals to make the code more clear and more foolproof for maintenance

if (...)  // or wherever you want to put your starting '{'
{
}

instead of

if (...)
  ;

in main() use perror() instead to get the real error from the OS instead of "can't open" e.g. perror(argv[1]);

i also agree with Emanuele that if (argc>1) seems a bit ... vague? better with if (stdin != in)

in encode() if you run out of memory you silently return, it would probably be good to return some kind of error code instead allowing for a user of the function to take action. This applies also to exit(), doing an exit in the middle is not a good strategy, better to return with an error code. Let the caller determine what to do e.g. write an error message.

in decode() you calloc(DICT_MAX,sizeof(DictEntry)) but there you do not check the return value.

Related Questions

Sponsored Content

1 Answered Questions

[SOLVED] Simplifying LZW compression & decompression

  • 2016-03-06 18:08:38
  • KrakenJaws
  • 7846 View
  • 1 Score
  • 1 Answer
  • Tags:   java compression

0 Answered Questions

Linux GPIO rotary encoder as volume control

1 Answered Questions

[SOLVED] JSON encoder/decoder for bandwidth efficiency

1 Answered Questions

[SOLVED] C++ Huffman coder and decoder

  • 2017-05-18 19:52:14
  • Yksuh
  • 1200 View
  • 8 Score
  • 1 Answer
  • Tags:   c++ compression

3 Answered Questions

[SOLVED] Message decoder

1 Answered Questions

[SOLVED] WAVE to FLAC encoder in C

1 Answered Questions

[SOLVED] Simple LZW compression algorithm

2 Answered Questions

[SOLVED] Run-length decoder

1 Answered Questions

[SOLVED] Base64 encoder optimization

2 Answered Questions

[SOLVED] toBase64 encoder

  • 2014-11-23 17:29:36
  • MaZaHaKa
  • 142 View
  • 5 Score
  • 2 Answer
  • Tags:   c converting base64

Sponsored Content