By SuddenMoustache


2015-07-27 22:26:27 8 Comments

Here is a simple decompression implementation. I'd very much appreciate any advice or comments about the structure/logic of the program, style of the code, or reusability of the features. I'm a real novice and really eager to learn from people here, so fire away!

:

/**
 *  lzw.h
 *  Function declarations for lempel ziv-welch decompression algorithm.
 */

#ifndef LZW_H
#define LZW_H

#include <stdio.h>
#include <stdlib.h>

/**
 *  Frees all blocks between 'from' and 'to' (inclusive) in the dict.
 */
void free_dict (unsigned char* dict[], int from, int to);

/**
 *  Enters the first 256 ASCII values into dict.
 */
void load_dict_defaults (unsigned char* dict[]);

/**
 *  Reads 12 bits from source and returns them as an int.
 */
int fread_twelve(FILE* source);

/**
 *  LZW decompression of source to dest.
 *  Returns 0 if successful, a positive integer otherwise.
 */
int decompress (FILE* source, FILE* dest);

#endif // LZW_H

/**
 *  decompress.c
 *  Implementation of lempel ziv-welch decompression algorithm using 12-bit fixed-width compression format.
 *
 *  Usage: decompress
 */

#include "lzw.h"
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char* argv[])
{   
    // check for correct number of args
    if (argc != 3)
    {
        printf("Usage: decompress source destination\n");
        return 1;
    }

    // open compressed file
    FILE* source = fopen(argv[1], "r");
    if (source == NULL)
    {
        printf("Error: cannot open %s\n", argv[1]);
        return 1;
    }

    // open destination file
    FILE* dest = fopen(argv[2], "w");
    if (dest == NULL)
    {
        printf("Error: cannot open %s\n", argv[2]);
        return 1;
    }

    // decompress
    if (decompress(source, dest) == 1)
    {
        printf("Decompression failed\n");
        fclose(source);
        fclose(dest);
        return 1;
    }
    else
    {
        fclose(source);
        fclose(dest);
        return 0;
    }
}

/**
 *  lzw.c
 *  Implementation of lempel ziv-welch decompression algorithm using 12-bit fixed-width compression format.
 *
 */

#include "lzw.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

unsigned char* dictionary[4096];

/**
 *  Enters the first 256 ASCII values into dict.
 */
void load_dict_defaults (unsigned char* dict[])
{
    unsigned char* temp;
    unsigned char c = 0;
    for (int i = 0; i < 256; i++)
    {
        temp = calloc(2, sizeof(unsigned char));
        temp[0] = c++;
        dict[i] = temp;
    }
}

/**
 *  Frees all blocks between 'from' and 'to' (inclusive) in the dict.
 */
void free_dict (unsigned char* dict[], int from, int to)
{
    for (int i = from; i <= to; i++)
    {
        free(dict[i]);
    }
}

/**
 *  Reads 12 bits from source and returns them as an int.
 *  Returns 0 on an error or EOF.
 */
int fread_twelve(FILE* source)
{
    int value;
    static bool left = true;

    unsigned char buf[2];

    if (left)
    {
        // if left, read two bytes and calculate int
        if (fread(buf, 1, 2, source) != 2) {return 0;}

        if (feof(source))    // we're at a 16-bit end!  
        {
            value = (buf[0] << 8) | buf[1];
        }
        else
        {
            value = (buf[0] << 4) | (buf[1] >> 4);
        }
        left = false;
    }
    else
    {
        // if right, rewind source by one byte, read two bytes and calculate int
        fseek(source, -1, SEEK_CUR);
        if (fread(buf, 1, 2, source) != 2) {return 0;}
        value = ((buf[0] & 0x0F) << 8) | buf[1];
        left = true;
    }
    return value;
}

/**
 *  LZW decompression of source to dest.
 *  Returns 0 if successful, a positive integer otherwise.
 */
int decompress (FILE* source, FILE* dest)
{
    load_dict_defaults(dictionary);

    int dictPos = 256;

    int prevCode, currCode;
    unsigned char* prevString, *currString;

    // read in the first character
    currCode = fread_twelve(source);
    if (dictionary[currCode] == NULL)
    {
        return 1;
    }
    size_t size;
    size = strlen((char*)dictionary[currCode]);
    currString = calloc(size + 2, sizeof(char)); //+2 for upcoming char and null
    if (currString == NULL)
    {
        return 1;
    }
    memcpy(currString, dictionary[currCode], size+1);
    fprintf(dest, "%s", currString);
    prevCode = currCode;
    prevString = currString;

    // read in the rest of the characters
    while ( (currCode = fread_twelve(source)) && currCode )
    {       
        if (dictionary[currCode] == NULL)
        {
            return 1;
        }
        size = strlen((char*)dictionary[currCode]);
        currString = calloc(size + 2, sizeof(char)); //+2 for upcoming char and null
        if (currString == NULL)
        {
            return 1;
        }
        memcpy(currString, dictionary[currCode], size+1);
        fprintf(dest, "%s", currString);

        if (currCode > dictPos)
        {
            dictionary[dictPos++] = (unsigned char*) strncat((char*)prevString, (char*)prevString, 1);
        }
        else
        {
            dictionary[dictPos++] = (unsigned char*) strncat((char*)prevString, (char*)currString, 1);
        }

        // restart dict if full.
        if (dictPos >= 4096)
        {
            free_dict(dictionary, 256, 4096);
            dictPos = 256;
        }

        prevCode = currCode;
        prevString = currString;
    }

    free_dict(dictionary, 0, dictPos-1);

    return 0;
}

Link to the followup question: Decompression implementation II

1 comments

@JS1 2015-07-27 23:41:37

Easy to read

Before I start getting critical of the code, I first want to say that it was well commented and the spacing/style of it was good. You used variable names that were descriptive and made sense. So the code was easy to read and understand.

Bugs: Binary file handling

Your program doesn't handle binary files correctly:

  1. The first minor problem is that you don't open the source and dest file with "rb" and "wb". This doesn't make any difference on a Unix system, but on a Windows system it might, because it may accidentally convert newline characters when you don't want that to happen.
  2. The bigger problem is that you don't output to your destination file correctly. Your current output method:

        fprintf(dest, "%s", currString);
    

    will mistakenly stop at any null characters in currString. That means that you can't decompress any file with a null character in it (such as a binary file).

  3. The same problem exists in your dictionary maintenance. The way you extend your dictionary string is with strncat(). But with strncat() you can't have any null characters in your dictionary. You should be using memcpy() instead. Similarly, you need to avoid using strlen() and instead keep track of the dictionary entry sizes yourself (perhaps by using a separate array called dictionaryLengths).

Bugs: Incorrect decoding

During encoding, the LZW algorithm outputs a 12 bit code for the longest dictionary match it can find, and then it creates a new dictionary entry for the previous match plus the next character. On the next iteration, it can immediately use the new entry it just added. Your decoder has a problem using the just created entry. For example, encoding a file containing aaa:

  1. Add all single characters to dictionary (slots 0-255).
  2. Encode a as 97 (12 bits).
  3. Add aa into the dictionary as entry #256.
  4. Encode aa as 256 (12 bits).

The output file would consist of the two 12-bit entries 97 and 256. Your decoder:

  1. Decodes 97 as 'a'. This is outside the main while loop.
  2. Starts the main loop:
while ( (currCode = fread_twelve(source)) && currCode )
{       
    if (dictionary[currCode] == NULL)
    {
        return 1;
    }

Here, currCode will be 256, but dictionary[256] is NULL because you haven't created it yet. The case of currCode == dictCode needs to be handled specially, where you add the dictionary entry first (using the code you already have under if (currCode > dictPos)), and then you process the entry.

NULL check problems

Your current check to see if currCode is valid uses a NULL check:

    if (dictionary[currCode] == NULL)

This doesn't work after you reset the dictionary because in free_dict(), you free the dictionary entries but you don't set the entries back to NULL. So on the second time through the dictionary, you might end up using an entry that has already been freed.

A better check than a NULL check would be to just check if currCode is in the valid range of [0..dictCode].

Extraneous line causes memory leak

In your main loop, you have these two lines where you call calloc() twice in a row. It was probably a copy and paste error:

    currString = calloc(size + 2, sizeof(char)); //+2 for upcoming char and null
    currString = calloc((strlen((char*)dictionary[currCode]) + 2), sizeof(char));

Avoid backward fseek

There's no need to seek backwards in your file. What you could do instead is remember the previous byte using a static variable. Then you use that byte in the "right" case. Something like this:

static bool left = true;
static unsigned char prevByte = 0;

if (left)
{
    // ...
    prevByte = buf[1];
}
else
{
    if (fread(buf, 1, 1, source) != 1) {return 0;}
    value = ((prevByte & 0x0F) << 8) | buf[0];
    left = true;
}

@fluffy 2015-07-28 05:34:02

I'd advise against using a static variable for storing state. This will make things much more difficult in the future if there's ever a situation where multiple streams are being decoded simultaneously. It's better to encapsulate state (including the FILE*) in a hidden struct type that you pass into the library function.

@JS1 2015-07-28 06:37:47

@fluffy I agree with you but since he already had a static variable there, I gave the quick fix. I was going to suggest what you said about having a state struct, for buffering purposes, because I don't like reading from the file 1-2 bytes at a time. But my review was already getting long and I didn't want to give an overload of information. If there's a followup question I'll add more.

Related Questions

Sponsored Content

2 Answered Questions

[SOLVED] Decompression implementation II

0 Answered Questions

Compression/decompression using Huffman Coding Algorithm

1 Answered Questions

[SOLVED] Simplifying LZW compression & decompression

  • 2016-03-06 18:08:38
  • KrakenJaws
  • 7860 View
  • 1 Score
  • 1 Answer
  • Tags:   java compression

2 Answered Questions

[SOLVED] Basic string compression implementation in C

  • 2017-11-12 17:39:34
  • Emre Can Kucukoglu
  • 1149 View
  • -1 Score
  • 2 Answer
  • Tags:   c strings compression

0 Answered Questions

Optimizing BZ2 decompression code for big data set

1 Answered Questions

[SOLVED] Prefix free compression and decompression

1 Answered Questions

[SOLVED] Multithreaded Decompression

2 Answered Questions

[SOLVED] Python 3 decompression routine 10x slower than C# equivalent

1 Answered Questions

[SOLVED] String compression implementation in C

2 Answered Questions

[SOLVED] Simple compression on steroids - now with decompression

  • 2016-01-05 19:56:26
  • ChrisWue
  • 116 View
  • 1 Score
  • 2 Answer
  • Tags:   c compression

Sponsored Content