By SuddenMoustache


2015-07-28 13:47:00 8 Comments

Here is an update to my previous thread which you can find here. Again, I'd very much appreciate any advice or comments about the structure/logic of the program, style of the code, or reusability of the features.

The fread_twelve function has been heavily modified to deal with detecting and reading the last entry, which is padded from 12 to 16 bits. Let me know what you think!

I'm also quite eager to hear how I can balance the heap. Valgrind says I have "778 allocs, 354 frees" on one run, but I thought I'd freed all my allocations!


/**
 *  file: lzw.h
 *
 *  function declarations for lempel ziv-welch decompression algorithm.
 */

#ifndef LZW_H
#define LZW_H

#include <stdio.h>
#include <stdlib.h>

typedef uint8_t byte;

/**
 *  Frees all blocks between 'from' and 'to' (inclusive) in the dict.
 */
void free_dict (byte* dict[], int from, int to);

/**
 *  Enters the first 256 ASCII values into dict, and their respective sizes into sizes.
 */
void load_dict_defaults (byte* dict[], int sizes[]);

/**
 *  Reads 12 bits from source and returns them as an int.
 */
int fread_twelve(FILE* source);

/**
 *  LZW decompression of source to dest.
 *  Returns 0 if successful, a positive integer otherwise.
 */
int decompress (FILE* source, FILE* dest);

#endif // LZW_H

/**
 *  file: decompress.c
 *
 *  implementation of lempel ziv-welch decompression algorithm using 12-bit fixed-width compression format.
 *
 *  usage: decompress source output
 */

#include "lzw.h"
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char* argv[])
{   
    // check for correct number of args
    if (argc != 3)
    {
        printf("Usage: decompress source output\n");
        return 1;
    }

    // open compressed file
    FILE* source = fopen(argv[1], "rb");
    if (source == NULL)
    {
        printf("Error: cannot open %s\n", argv[1]);
        return 1;
    }

    // open destination file
    FILE* dest = fopen(argv[2], "wb");
    if (dest == NULL)
    {
        printf("Error: cannot open %s\n", argv[2]);
        return 1;
    }

    // decompress
    if (decompress(source, dest) == 1)
    {
        printf("Decompression failed\n");
        fclose(source);
        fclose(dest);
        return 1;
    }
    else
    {
        fclose(source);
        fclose(dest);
        return 0;
    }
}

/**
 *  file: lzw.c
 *  
 *  
 *  Implementation of lempel ziv-welch decompression algorithm using 12-bit fixed-width compression format.
 *
 */

#include "lzw.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

byte* dictionary[4096];
int dict_entry_sizes[4096];

/**
 *  Enters the first 256 ASCII values into dict.
 */
void load_dict_defaults (byte* dict[], int sizes[])
{
    byte* temp;
    byte c = 0;
    for (int i = 0; i < 256; i++)
    {
        temp = malloc(1);
        temp[0] = c++;
        dict[i] = temp;
        dict_entry_sizes[i] = 1;
    }
}

/**
 *  Frees all blocks between 'from' and 'to' (inclusive) in the dict.
 */
void free_dict (byte* dict[], int from, int to)
{
    for (int i = from; i <= to; i++)
    {
        free(dict[i]);
    }
}

/**
 *  Gets 12 bits from source and returns them as an int.
 *  Returns 0 on an error or EOF.
 */
int fread_twelve(FILE* source)
{
    int value;
    static bool left = true;
    static byte prevBytes[2];

    byte buf[3];

    if (left)
    {
        if (fread(buf, 1, 2, source) != 2) 
        {
            // reached eof: even number of entries
            return 0;
        }
        if (fread(buf+2, 1, 1, source) != 1)
        {
            // reached eof: buf[0] and buf[1] are the padded last entry
            value = (buf[0] << 8) | buf[1];
        }
        else
        {
            value = (buf[0] << 4) | (buf[1] >> 4);
            left = false;
            prevBytes[0] = buf[1];
            prevBytes[1] = buf[2];
        }
    }
    else
    {
        value = ((prevBytes[0] & 0x0F) << 8) | prevBytes[1];
        left = true;
    }
    return value;
}

/**
 *  LZW decompression of source to dest.
 *  Returns 0 if successful, a positive integer otherwise.
 */
int decompress (FILE* source, FILE* dest)
{
    load_dict_defaults(dictionary, dict_entry_sizes);

    int dictPos = 256;

    int prevCode, currCode;
    byte* prevString, *currString;

    // read in the first character
    currCode = fread_twelve(source);
    size_t size = dict_entry_sizes[currCode];
    currString = malloc(size + 1); //+1 for upcoming char
    if (currString == NULL)
    {
        fprintf(stderr, "decompress: error assigning currString.\n");
        return 1;
    }
    memcpy(currString, dictionary[currCode], size);
    fwrite(currString, size, 1, dest);
    prevCode = currCode;
    prevString = currString;

    // read in the rest of the characters
    size_t oldSize;
    while ( (currCode = fread_twelve(source)) && currCode )
    {       
        if (currCode > dictPos)
        {
            fprintf(stderr, "decompress: currCode out of dictionary range.\n");
            return 1;
        }

        oldSize = dict_entry_sizes[prevCode];

        if (currCode < dictPos)     // code in dict
        {
            size = dict_entry_sizes[currCode];
            currString = malloc(size + 1); // +1 for adding char on next loop
            if (currString == NULL)
            {
                fprintf(stderr, "decompress: error assigning currString.\n");
                return 1;
            }
            memcpy(currString, dictionary[currCode], size);
            fwrite(currString, size, 1, dest);

            memcpy(prevString + oldSize, currString, 1);
            dictionary[dictPos] = prevString;
            dict_entry_sizes[dictPos++] = oldSize + 1;

            // restart dict if full. 
            if (dictPos >= 4096)
            {
                free_dict(dictionary, 256, 4095);
                dictPos = 256;
            }
        }
        else    // code not in dict until this pass
        {
            memmove(prevString + oldSize, prevString, 1);   // memmove allows overlap
            size = oldSize + 1;
            fwrite(prevString, size, 1, dest);
            dictionary[dictPos] = prevString;
            dict_entry_sizes[dictPos++] = size; 
            currString = malloc(size + 1);  // +1 for adding char on next loop
            if (currString == NULL)
            {
                fprintf(stderr, "decompress: error assigning currString.\n");
                return 1;
            }
            memcpy(currString, prevString, size);
        }
        prevCode = currCode;
        prevString = currString;
    }

    free(prevString);       
    free_dict(dictionary, 0, dictPos-1);

    return 0;
}

2 comments

@JS1 2015-07-29 02:09:54

Preface

Before I start the review, I want to say it was difficult to test your program without an encoder. So I wrote my own encoder, with your decoder's specifications in mind (12-bit codes, MSB format, reset dictionary at 4096 entries, etc). I also wrote my own decoder to make sure that my encoder worked. You can find both of these programs at Github here. There are some discrepencies between my decoder and yours which I will mention below.

No need to memcpy 1 character

In a couple of places, you use memcpy to tack on a single character to the end of a buffer. You can simplify these calls:

        memcpy(prevString + oldSize, currString, 1);

to just:

        prevString[oldSize] = currString[0];

Can simplify code to add to dictionary

Your two cases of "using the just added entry" and "using an existing entry" can be merged into one case. You can handle both cases early (before you need to use the entry). The only difference is where the last character of the new entry comes from. So I rewrote your loop to look like this:

    while ( (currCode = fread_twelve(source)) && currCode )
    {
        if (currCode > dictPos)
        {
            fprintf(stderr, "decompress: currCode out of dictionary range.\n");
            return 1;
        }

        oldSize = dict_entry_sizes[prevCode];

        // Add entry to dictionary first, in case we need to use it below.
        if (currCode < dictPos)
            prevString[oldSize] = dictionary[currCode][0];
        else
            prevString[oldSize] = prevString[0];
        dictionary[dictPos] = prevString;
        dict_entry_sizes[dictPos++] = oldSize + 1;

        size = dict_entry_sizes[currCode];
        currString = malloc(size + 1); // +1 for adding char on next loop
        if (currString == NULL)
        {
            fprintf(stderr, "decompress: error assigning currString.\n");
            return 1;
        }
        memcpy(currString, dictionary[currCode], size);
        fwrite(currString, size, 1, dest);

        // restart dict if full.
        if (dictPos >= 4096)
        {
            free_dict(dictionary, 256, 4095);
            dictPos = 256;
        }
        prevCode = currCode;
        prevString = currString;
    }

End of file handling

When I wrote my encoder, I outputted the last partial code in MSB format with four bits of trailing zeroes. I thought that would be the most natural way to do it. So for example, if the last code was 0x123 and the last code was output on a byte boundary, then my encoded file would end with these two bytes: 12 30.

Your decoder appears to expect the last two bytes to look like this: 01 23. Here, the 4 bits of padding come in on the left side. While that is a possible encoding, it requires your fread_twelve() function to make a special case for it. If you use my encoding, your end of file case would look just like the normal case, and you could remove the special case for it.

Dictionary reset handling

I'm not sure your decoder handles dictionary resets correctly. This again may be a difference between my encoder and yours. But when I ran your decoder against my encoder it didn't sync up after the first dictionary reset. It looks like your code might use a freed dictionary entry after a reset if the previous code was higher than 255, but again I'm not sure that's true depending on your encoder.

@glampert 2015-07-29 18:13:53

JS1, that's a neat little implementation you've come up with. Why don't you post for review and keep us busy? ;)

@JS1 2015-07-29 18:15:34

@glampert Which one, the encoder, the decoder, or both?

@glampert 2015-07-29 18:17:36

Up to you, I'd post each on a separate question. Both seem like good code review material to me.

@glampert 2015-07-29 18:19:14

Well, really, the encoder is small, so it would also be fine to make it a single question containing both...

@JS1 2015-07-29 18:47:25

@glampert Here you go

@glampert 2015-07-29 19:17:02

Awesome! I'm sure you'll get very good reviews :)

@glampert 2015-07-28 19:14:36

A few recommendations you might find useful:

Cleanup the global/static data:

Right now your program make use of some global/static data. E.g.: dictionary, dict_entry_sizes, prevBytes and left (anything else that I've missed?).

This poses a few limitations. For instance, your decompression routines cannot be run in parallel, since there is only one shared dictionary and friends, so running that code in different threads would result in data races. A good compression library should definitely be suitable for concurrent use!

Another potential issue is that it would be much harder to expand your code to support multiple compressors that can be paused and resumed, again because they would all share the same common global variables.

To improve this, you could make use of a context structure for your compressor. If you haven't experimented with structs in C yet, this should be a great opportunity. Move all the global and static variables into a structure declared it in your header file. Example:

struct LzwCompression
{
    bool left;
    byte prevBytes[2];
    byte* dictionary[4096];
    int dict_entry_sizes[4096];
};

Now each of the decompression functions must take a pointer to this context struture and source the variables from it, instead of reading the globals. Example with decompress():

int decompress (LzwCompression* lzw, FILE* source, FILE* dest)
{
    load_dict_defaults(lzw->dictionary, lzw->dict_entry_sizes);
    lzw->left = true;
    ...

And the caller would then provide an instance of this structure to the decompression functions:

LzwCompression lzw;
decompress(&lzw, sourceFile, destFile);

// You can have as many 'LzwCompressions' as you need, without the danger of they stepping on each other's toes.

After this change is applied, be sure to properly initialize the the structure. Your code probably relies on the globals being zero initialized. The compiler does that for you for global variables, but a struct declared at function scope is not cleared by default. So you might memset() the LzwCompression instance either on main() or right at the top of decompress(). Also set the left field to true after memseting the structure, since that was the starting value of the global originally.

Hide implementation details:

Internal functions that are only relevant to the implementation in the .c file should not be exposed in your header file. Keep header files as lean and mean as you can to streamline compile times. I believe the only function the user of your decompression cares about is decompress(), so remove the prototypes of the other ones from the header file and place those functions at the top of your .c file, marking them as static. In this case the static keyword means that the function is local, so the linker doesn't export it. It is not quite the same meaning of a static variable, so you don't have to worry with shared/global data.

// lzw.c

static void load_dict_defaults (byte* dict[], int sizes[])
{
    ...
}

static void free_dict (byte* dict[], int from, int to)
{
    ...
}

.. etc ...

Odds and ends:

  • I see that you already use the bool type, so extend that to the return type of functions that only have two possible outcomes, success (true) and failure (false). Returning integer error codes is error prone for the programmer since there isn't a single agreement or convention on which values should be returned. A boolean is much more obvious of its meaning.

  • Go easy on the magic numbers. You have a lot of repeated literal constants out there that could be consolidated into a few named constants. #define and enum are the traditional C ways of doing that, but I'd instead recommend preferring static consts whenever possible to gain some type safety.

  • You correctly use stderr in your decompression routines to print errors, but in main() all error reporting is done with printf, which outputs to stdout. Error output should always be sent to stderr, so that users can easily filter the program output. Replace those error prints with stderr. You might also consider adding the output of strerror() to the messages involving system errors (like fopen() and friends).


Well, that's all the juice I have! Compression is not my field, so I don't know how you can improve the algorithm. Hopefully someone else will provide some comments on that.

As for the memory leaks you mention, I haven't tested, sorry. If the output of Valgrind is not helping you much, you might try the Clang LeakSanitizer which is a very powerful tool with a super detailed error output that should make it very easy to find the source of the leaks.


EDIT: In a second thought, I think I do know where your memory leaks are:

void load_dict_defaults (byte* dict[], int sizes[])
{
    byte* temp;
    byte c = 0;
    for (int i = 0; i < 256; i++)
    {
        temp = malloc(1); // <-- is this ever freed somewhere?
        temp[0] = c++;
        dict[i] = temp;
        dict_entry_sizes[i] = 1;
    }
}

If I follow correctly the logic of decompress(), that 1 byte malloc you did in the above function gets overwritten without ever being freed. But actually, mallocing a single byte of memory is very inefficient, so you should try to refactor that to instead perform a single allocation of the sum of all bytes. I'll leave that as an exercise ;).

@glampert 2015-07-28 19:48:29

@SuddenCollapse, yes, refactoring the code to use a struct is as easy as appending struct_name-> to each variable :)! BTW, I think I've found the source of your memory leaks, take a new look at the answer.

@glampert 2015-07-28 20:02:17

@SuddenCollapse, I'm afraid not. Anywhere you had dictionary[foo] you'll replace with lzw->dictionary[foo], and so on for the others, because the variable is now a member of a new scope (the struct). But hopefully, you're using some smart text editor with find/replace capabilities? If not, this might be a cue for me introduce you to the Vim editor.

@glampert 2015-07-28 20:19:35

@SuddenCollapse, no because (I think) you overwrite most of the pointers in the while loop. E.g.: dictionary[dictPos] = prevString; You'd have to free the pointer at dictionary[dictPos] before assigning a new value to it. Make sure to replace all occurrences of the old globals. Comment them out to be sure you didn't forget some!

@glampert 2015-07-28 20:28:26

@SuddenCollapse, you know what, I think there's one more little thing you might need to do for the struct to work properly. You see, the globals you had before were being zero initialized by the compiler for you, and I think your code might have been relying on that. When you declare a struct in a function scope, this is no longer true. So you might need to memset() your struct either before calling the decompress or right at the beginning of the function. Also make sure to initialize left and prevBytes appropriately. Sorry for overlooking that.

Related Questions

Sponsored Content

1 Answered Questions

[SOLVED] Decompression implementation

0 Answered Questions

Compression/decompression using Huffman Coding Algorithm

1 Answered Questions

[SOLVED] Simplifying LZW compression & decompression

  • 2016-03-06 18:08:38
  • KrakenJaws
  • 8290 View
  • 1 Score
  • 1 Answer
  • Tags:   java compression

2 Answered Questions

[SOLVED] Basic string compression implementation in C

  • 2017-11-12 17:39:34
  • Emre Can Kucukoglu
  • 1407 View
  • -1 Score
  • 2 Answer
  • Tags:   c strings compression

0 Answered Questions

Optimizing BZ2 decompression code for big data set

1 Answered Questions

[SOLVED] Prefix free compression and decompression

1 Answered Questions

[SOLVED] Multithreaded Decompression

2 Answered Questions

[SOLVED] Python 3 decompression routine 10x slower than C# equivalent

1 Answered Questions

[SOLVED] String compression implementation in C

2 Answered Questions

[SOLVED] Simple compression on steroids - now with decompression

  • 2016-01-05 19:56:26
  • ChrisWue
  • 126 View
  • 1 Score
  • 2 Answer
  • Tags:   c compression

Sponsored Content