Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <time.h>
- #include <stdio.h>
- #include <string.h>
- #include <stdlib.h>
- #define WORD_SIZE (1 << 6)
- #define INPUT_BUFFER_SIZE (1 << 10)
- #define OUTPUT_BUFFER_SIZE (1 << 10)
- #define MAX_WORDS (1 << 25)
- #define MAX_NODES (1 << 19)
- #define HASH_TABLE_HEIGHT (1 << 20)
- #define MAX_TREE_LEVELS (1 << 5)
- #define BIN_BUFFER_SIZE (1 << 10)
- int fileSize;
- typedef unsigned int UI;
- typedef long long unsigned int LLUI;
- typedef unsigned char byte;
- typedef struct Node {
- char word[WORD_SIZE];
- byte word_len;
- UI code;
- byte code_len;
- UI count;
- byte isLeaf;
- struct Node* left;
- struct Node* right;
- struct Node* next;
- struct Node* sibling;
- } Node;
- Node allNodes[MAX_NODES + 1] = {};
- Node* lastNode = allNodes - 1;
- #define endNodes (allNodes + MAX_NODES)
- #define lastUsableNode (allNodes + MAX_NODES - 1)
- Node* wordSequence[MAX_WORDS];
- int wordSequenceLength = 0;
- int nWords = 0;
- int current_hash;
- int colisions = 0;
- Node* hashTable[HASH_TABLE_HEIGHT] = {};
- #define NEXTHASH(h, c)((h<<5)-h+c)
- byte* bufferIt;
- char inputBuffer[INPUT_BUFFER_SIZE];
- #define inputBufferEnd (inputBuffer + INPUT_BUFFER_SIZE)
- char last_char = 0;
- char word_buffer[WORD_SIZE];
- char* word_begin;
- char* word_end;
- byte addWordMap[256][256];
- Node* minHeap[MAX_NODES];
- int minHeapLength;
- typedef struct NodeList {
- Node* head;
- Node* foot;
- } NodeList;
- NodeList treeLevels[MAX_TREE_LEVELS] = {};
- int nLevels = 0;
- byte binBuffer[BIN_BUFFER_SIZE + 8];
- byte current_bin;
- byte current_bin_len;
- static inline void pushToBinBuffer(UI bin, byte len) {
- LLUI temp = current_bin | (((LLUI)bin) << current_bin_len);
- current_bin_len += len;
- while (current_bin_len >= 8) {
- *bufferIt++ = 0xff & temp;
- temp >>= 8;
- current_bin_len -= 8;
- }
- current_bin = (byte) temp;
- }
- static inline Node* newNode() {
- if (lastNode == lastUsableNode) {
- puts("Error: allNodes overflow");
- exit(0);
- }
- return ++lastNode;
- }
- static inline void minHeapPush(Node* node) {
- int pos = minHeapLength++;
- int count = node->count;
- int parent_pos;
- Node* parent;
- while (pos) {
- parent_pos = (pos - 1) >> 1;
- parent = minHeap[parent_pos];
- if (parent->count <= count) {
- break;
- }
- minHeap[pos] = parent;
- pos = parent_pos;
- }
- minHeap[pos] = node;
- }
- byte ptFlag;
- byte charToStateCode(char chr) {
- if (!chr) return 1;
- if (chr >= 'a' && chr <= 'z' || chr >= 'A' && chr <= 'Z' || chr & ptFlag) return 2;
- if (chr >= '0' && chr <= '9') return 3;
- if (chr == 10 || chr == 13 || chr == '\t' || chr == ' ') return 4;
- return 5;
- }
- void initAddWordMap(byte map[][256]) {
- byte tempMap[6][6] = {};
- int i, j;
- for (i=2; i<=5; ++i) {
- tempMap[i][1] = 1;
- for (j=2; j<=5; ++j) {
- if (i != j || i == 5 || i == 4) {
- tempMap[i][j] = 1;
- }
- }
- }
- for (i=0; i<256; ++i) {
- for (j=0; j<256; ++j) {
- byte last = charToStateCode(i);
- byte next = charToStateCode(j);
- map[i][j] = tempMap[last][next];
- }
- }
- }
- static inline int compareStr(char const *a, char const *b) {
- while (*a == *b && *a) ++a, ++b;
- return *a - *b;
- }
- static inline void copyStr(char *dst, char const *src) {
- char copied;
- do {
- copied = *dst++ = *src++;
- } while (copied);
- }
- static inline void countWord() {
- *word_end = 0;
- if (word_end - word_begin >= WORD_SIZE) {
- puts("Error: Word Overflow");
- exit(0);
- }
- int index = ((unsigned int) current_hash) % HASH_TABLE_HEIGHT;
- Node* node = hashTable[index];
- while (node && compareStr(node->word, word_begin)) {
- node = node->next;
- }
- if (node) {
- ++ node->count;
- } else {
- ++ nWords;
- if (hashTable[index]) {
- ++ colisions;
- }
- node = newNode();
- node->count = 1;
- node->next = hashTable[index];
- node->word_len = word_end - word_begin;
- hashTable[index] = node;
- }
- if (wordSequenceLength == MAX_WORDS) {
- puts("Error: countWord: wordSequence overflow");
- exit(0);
- }
- wordSequence[wordSequenceLength++] = node;
- current_hash = 0;
- word_begin = word_end = lastNode[1].word;
- }
- static inline void iterateInputByte(char chr) {
- if (addWordMap[last_char][chr]) {
- countWord();
- }
- current_hash = NEXTHASH(current_hash, chr);
- *word_end++ = last_char = chr;
- }
- static inline Node* removeMinHeapRoot() {
- int pos = 0;
- Node* root = minHeap[0];
- Node* node = minHeap[--minHeapLength];
- int count = node->count;
- for (;;) {
- int pos_a = (pos << 1) | 1;
- int pos_b = (pos + 1) << 1;
- int count_a = 0x7fffffff;
- int count_b = 0x7fffffff;
- if (pos_a < minHeapLength) {
- count_a = minHeap[pos_a]->count;
- }
- if (pos_b < minHeapLength) {
- count_b = minHeap[pos_b]->count;
- }
- int child_pos;
- int child_count;
- if (count_a <= count_b) {
- child_pos = pos_a;
- child_count = count_a;
- } else {
- child_pos = pos_b;
- child_count = count_b;
- }
- if (child_count >= count) {
- break;
- }
- minHeap[pos] = minHeap[child_pos];
- pos = child_pos;
- }
- minHeap[pos] = node;
- return root;
- }
- static inline void fillMinHeap() {
- minHeapLength = 0;
- Node* node = allNodes;
- while (node <= lastNode) {
- minHeapPush(node++);
- }
- }
- void generateHuffmanTree() {
- while (minHeapLength > 1) {
- Node* a = removeMinHeapRoot();
- Node* b = removeMinHeapRoot();
- Node* node = newNode();
- node->left = a;
- node->right = b;
- node->count = a->count + b->count;
- minHeapPush(node);
- }
- }
- void prepareHuffmanTree(Node* node, int level, UI code, byte code_len) {
- if (level == nLevels) {
- ++nLevels;
- }
- NodeList* list = treeLevels + level;
- Node* foot = list->foot;
- if (!foot) {
- list->head = node;
- } else {
- foot->sibling = node;
- }
- list->foot = node;
- node->code = code;
- node->code_len = code_len;
- Node* left = node->left;
- if (left) {
- node->isLeaf = 0;
- prepareHuffmanTree(left, level + 1, code, code_len + 1);
- prepareHuffmanTree(node->right, level + 1, code | (1 << code_len), code_len + 1);
- } else {
- node->isLeaf = 1;
- }
- }
- void resetBinBuffer() {
- bufferIt = binBuffer;
- current_bin = 0;
- current_bin_len = 0;
- }
- void writeHuffmanTreeStructure(FILE* fout) {
- int i;
- resetBinBuffer();
- for (i=0; i<nLevels; ++i) {
- Node* node = treeLevels[i].head;
- while (node) {
- pushToBinBuffer(node->isLeaf, 1);
- int length = bufferIt - binBuffer;
- if (length >= BIN_BUFFER_SIZE) {
- fwrite(binBuffer, length, 1, fout);
- bufferIt = binBuffer;
- }
- node = node->sibling;
- }
- }
- if (current_bin_len) {
- *bufferIt++ = current_bin;
- }
- int length = bufferIt - binBuffer;
- if (length) {
- fwrite(binBuffer, length, 1, fout);
- }
- }
- void writeUniqueWords(FILE* fout) {
- int i;
- for (i=0; i<nLevels; ++i) {
- Node* node = treeLevels[i].head;
- while (node) {
- if (node->isLeaf) {
- fwrite(node->word, node->word_len + 1, 1, fout);
- }
- node = node->sibling;
- }
- }
- }
- byte current_byte;
- byte current_shift;
- NodeList nodeQueue = {};
- static inline void addNodeToQueue() {
- Node* node = newNode();
- Node* foot = nodeQueue.foot;
- if (!foot) {
- nodeQueue.head = node;
- } else {
- foot->next = node;
- }
- nodeQueue.foot = node;
- }
- static inline Node* popNodeFromQueue() {
- Node* node = nodeQueue.head;
- if (node) {
- nodeQueue.head = node->next;
- if (!nodeQueue.head) {
- nodeQueue.foot = NULL;
- }
- }
- return node;
- }
- static inline void iterateBitReadingHuffmanTreeStructure() {
- Node* node = popNodeFromQueue();
- node->isLeaf = (current_byte >> current_shift) & 1;
- if (!node->isLeaf) {
- addNodeToQueue();
- node->left = lastNode;
- addNodeToQueue();
- node->right = lastNode;
- }
- ++ current_shift;
- }
- void readHuffmanTreeStructure(FILE* fin) {
- lastNode = allNodes - 1;
- fseek(fin, 0L, SEEK_END);
- fileSize = ftell(fin);
- int bytesLeft = fileSize;
- int bufferSize;
- if (fileSize < BIN_BUFFER_SIZE) {
- bufferSize = fileSize;
- } else {
- bufferSize = BIN_BUFFER_SIZE;
- }
- byte* bufferEnd = binBuffer + bufferSize;
- fseek(fin, 0L, SEEK_SET);
- fread(binBuffer, bufferSize, 1, fin);
- bytesLeft -= bufferSize;
- current_shift = 0;
- addNodeToQueue();
- Node* node;
- bufferIt = binBuffer;
- current_byte = *bufferIt;
- while (nodeQueue.head) {
- iterateBitReadingHuffmanTreeStructure();
- if (current_shift == 8) {
- if (++bufferIt == bufferEnd) {
- bufferIt = binBuffer;
- if (bytesLeft) {
- if (bytesLeft < BIN_BUFFER_SIZE) {
- bufferSize = bytesLeft;
- } else {
- bufferSize = BIN_BUFFER_SIZE;
- }
- bytesLeft -= bufferSize;
- fread(binBuffer, BIN_BUFFER_SIZE, 1, fin);
- }
- }
- current_shift = 0;
- current_byte = *bufferIt;
- }
- }
- prepareHuffmanTree(allNodes, 0, 0, 0);
- }
- void readWords(FILE* fin) {
- int nNodes = lastNode - allNodes + 1;
- int startPos = (nNodes + 7)/8;
- fseek(fin, startPos, SEEK_SET);
- int bytesLeft = fileSize - startPos;
- int bufferSize;
- Node* node;
- for (node = allNodes; node <= lastNode; ++node) {
- if (!node->isLeaf) {
- continue;
- }
- char* str = node->word;
- while (*str++ = fgetc(fin));
- node->word_len = str - node->word - 1;
- }
- }
- void readFile(FILE* fin) {
- fseek(fin, 0L, SEEK_END);
- fileSize = ftell(fin);
- fseek(fin, 0L, SEEK_SET);
- int i;
- initAddWordMap(addWordMap);
- word_begin = word_end = lastNode[1].word;
- for (i=fileSize / INPUT_BUFFER_SIZE; i--;) {
- fread(inputBuffer, sizeof(inputBuffer), 1, fin);
- char* ptr = inputBuffer;
- while (ptr < inputBufferEnd) {
- iterateInputByte(*ptr++);
- }
- }
- i = fileSize % INPUT_BUFFER_SIZE;
- if (i) {
- fread(inputBuffer, i, 1, fin);
- char* ptr = inputBuffer;
- for (;i--;) {
- iterateInputByte(*ptr++);
- }
- }
- iterateInputByte(0);
- fillMinHeap();
- generateHuffmanTree();
- prepareHuffmanTree(minHeap[0], 0, 0, 0);
- }
- void writeCompressedWords(FILE* fout) {
- int i;
- resetBinBuffer();
- Node** ptr = wordSequence;
- Node** end = wordSequence + wordSequenceLength;
- Node* node;
- fwrite(&wordSequenceLength, sizeof(int), 1, fout);
- for (;ptr < end; ++ptr) {
- node = *ptr;
- pushToBinBuffer(node->code, node->code_len);
- int length = bufferIt - binBuffer;
- if (length >= BIN_BUFFER_SIZE) {
- fwrite(binBuffer, length, 1, fout);
- bufferIt = binBuffer;
- }
- }
- if (current_bin_len) {
- *bufferIt++ = current_bin;
- }
- int length = bufferIt - binBuffer;
- if (length) {
- fwrite(binBuffer, length, 1, fout);
- }
- }
- int wordsLeft;
- Node* compass;
- static inline void decompressWordBitIteration(FILE* fout) {
- if (compass->isLeaf) {
- fwrite(compass->word, compass->word_len, 1, fout);
- compass = allNodes;
- -- wordsLeft;
- return;
- }
- if ((current_byte >> current_shift) & 1) {
- compass = compass->right;
- } else {
- compass = compass->left;
- }
- ++ current_shift;
- }
- void decompressWords(FILE* fin, FILE* fout) {
- fread(&wordsLeft, sizeof(int), 1, fin);
- compass = allNodes;
- int bytesLeft = fileSize - ftell(fin);
- int bufferSize;
- if (fileSize < BIN_BUFFER_SIZE) {
- bufferSize = fileSize;
- } else {
- bufferSize = BIN_BUFFER_SIZE;
- }
- byte* bufferEnd = binBuffer + bufferSize;
- fread(binBuffer, bufferSize, 1, fin);
- bytesLeft -= bufferSize;
- current_shift = 0;
- bufferIt = binBuffer;
- current_byte = *bufferIt;
- while (wordsLeft) {
- decompressWordBitIteration(fout);
- if (current_shift == 8) {
- if (++bufferIt == bufferEnd) {
- bufferIt = binBuffer;
- if (bytesLeft) {
- if (bytesLeft < BIN_BUFFER_SIZE) {
- bufferSize = bytesLeft;
- } else {
- bufferSize = BIN_BUFFER_SIZE;
- }
- bytesLeft -= bufferSize;
- fread(binBuffer, BIN_BUFFER_SIZE, 1, fin);
- }
- }
- current_shift = 0;
- current_byte = *bufferIt;
- }
- }
- }
- void compress(FILE* fin, FILE* fout) {
- clock_t t1 = clock();
- readFile(fin);
- writeHuffmanTreeStructure(fout);
- writeUniqueWords(fout);
- writeCompressedWords(fout);
- clock_t t2 = clock();
- int l1 = ftell(fin);
- int l2 = ftell(fout);
- fclose(fin);
- fclose(fout);
- float t = (t2 - t1) / (float) CLOCKS_PER_SEC;
- printf("Tempo de execucao: %.4fs\n", t);
- printf("Espaco liberado: %.2f%%\n", 100 - l2 * 100.0f / l1);
- }
- void decompress(FILE* fin, FILE* fout) {
- clock_t t1 = clock();
- readHuffmanTreeStructure(fin);
- readWords(fin);
- decompressWords(fin, fout);
- clock_t t2 = clock();
- fclose(fin);
- fclose(fout);
- float t = (t2 - t1) / (float) CLOCKS_PER_SEC;
- printf("Tempo de execucao: %.4fs\n", t);
- }
- void clearBufferStdin() {
- while (getchar()!='\n');
- }
- void pause() {
- puts("Pressione enter para prosseguir");
- clearBufferStdin();
- }
- void readStr(char s[], int max) {
- fgets(s, max, stdin);
- char* p = strchr(s, '\n');
- if (p) {
- *p = '\0';
- } else {
- clearBufferStdin();
- }
- }
- int lerOpcao(char const s[], int min, int max) {
- int op;
- for (;;) {
- puts(s);
- printf("> ");
- scanf("%d", &op);
- clearBufferStdin();
- if (op >= min && op <= max) {
- break;
- }
- puts("Codigo de operacao invalido");
- pause();
- }
- }
- int main(int argc, char const *argv[]) {
- int op = lerOpcao("Operacao [1 = Compressao, 2 = Descompressao]", 1, 2);
- if (op == 1) {
- if (lerOpcao("Considerar caracteres fora da tabela ASCII como letras? [1 = Sim, 2 = Nao])", 1, 2) == 1) {
- ptFlag = 0x80;
- } else {
- ptFlag = 0;
- }
- }
- FILE* fin;
- FILE* fout;
- char fname[1024] = "";
- for (;;) {
- printf("Arquivo de entrada: ");
- readStr(fname, 1024);
- if (*fname) {
- fin = fopen(fname, "rb");
- if (!fin) {
- printf("Falha ao abrir %s\n", fname);
- pause();
- } else {
- break;
- }
- }
- }
- for (;;) {
- printf("Arquivo de saida: ");
- readStr(fname, 1024);
- if (*fname) {
- fout = fopen(fname, "wb");
- if (!fout) {
- printf("Falha ao abrir %s\n", fname);
- pause();
- } else {
- break;
- }
- }
- }
- if (op == 1) {
- compress(fin, fout);
- } else {
- decompress(fin, fout);
- }
- }
Add Comment
Please, Sign In to add comment