# Compress

import math
import numpy as np
import scipy as sp
import scipy.sparse as ss
import bz2
import pickle
import time
import timing
import os
import pandas as pd
from pathlib import Path

# Timers used for debugging. time.perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals (time.clock() was
# removed in Python 3.8 and time.time() can jump when the system clock is
# adjusted).
start = time.perf_counter()
# Name of the file to be compressed
filename = Path(
    r'C:\Users\gashw\OneDrive\Desktop\Project Work\implement\Samples\Data\HR_edges.csv')
# Name of the compressed file
filename2 = Path(
    r'C:\Users\gashw\OneDrive\Desktop\Project Work\implement\Samples\Compressed\HR_edges_op.bin')
# Read the space-separated, header-less edge list into a pandas dataframe
data = pd.read_csv(filename, sep=' ', header=None, dtype=np.int64)
end = time.perf_counter()
print("Time to read edgelist :")
print(end - start)
start = time.perf_counter()
# Drop repeated edges first: coo_matrix sums duplicate (row, col) entries, so
# a repeated edge would become a value other than 1 and would be skipped by
# the "== 1" test used when the rows are compressed.
edges = data.drop_duplicates(subset=[0, 1])
# Source and target node ids of every edge
rows = edges[0]
cols = edges[1]
ones = np.ones(len(rows), np.uint32)
# Without an explicit shape coo_matrix infers (max(rows) + 1, max(cols) + 1),
# which is not square when the largest node id appears in only one column.
# The compression loop indexes every row with range(matrix.shape[0]), so the
# matrix is forced to be square here.
num_nodes = int(max(rows.max(), cols.max())) + 1
# Load the data frame into a sparse coo matrix and convert it into an array
matrix = ss.coo_matrix(
    (ones, (rows, cols)), shape=(num_nodes, num_nodes), dtype=np.int8).toarray()
end = time.perf_counter()
print("Time to generate sparse matrix :")
print(end - start)
# Creates a list to store the final output values before writing into a file
list1 = []

# Finds the parent of current node


def parent(index):
    """Return the position of the parent of the node stored at ``index``.

    The quotient ``(index - 1) / 2`` is truncated toward zero, so the root
    (index 0) is reported as its own parent.
    """
    half = (index - 1) / 2
    return int(half)

# Finds the sibling of the current node (the right or the left one, depending on the index)


def sibling(index):
    """Return the position of the node that shares a parent with ``index``.

    Odd positions are paired with the next position (``index + 1``); every
    other position is paired with the previous one (``index - 1``).
    """
    step = 1 if index % 2 == 1 else -1
    return index + step


# Monotonic clock for interval timing (not affected by system clock changes)
start = time.perf_counter()
len_row = matrix.shape[0]
# Find the height of the binary tree and use it to find n, i.e. the number of
# elements in the array that stores the tree (n = 2**height - 1)
# NOTE(review): when the row length is not a power of two, start_index below
# lands above the bottom level of the tree, so the first columns share slots
# with internal nodes - confirm the decoder expects this layout.
height = int(math.log2(len_row)) + 1
n = (2 ** height) - 1

# Loop over every row of the matrix and generate its compressed format
for row_index in range(len_row):
    print("Element "+str(row_index))
    # input_array = current row of the matrix
    input_array = matrix[row_index]
    # Initialize the tree array with -1, meaning "node not visited"
    temp_array = np.full(n, -1, dtype=np.int8)
    # The row is mapped onto the last len(input_array) slots of the tree array
    start_index = n - len(input_array)
    # Visit only the columns that hold a 1, in ascending order, instead of
    # testing every column in a Python loop. As before, only the first
    # len_row columns are considered; the slice also means a row shorter than
    # len_row is scanned in full instead of raising IndexError.
    for col in np.where(input_array[:len_row] == 1)[0]:
        current_index = start_index + col
        # Walk towards the root, marking each node on the way with 1 and the
        # sibling hanging off the path with 0. Stop at the first node whose
        # parent is already marked: everything above it is already marked.
        # The root is its own parent, so the walk always ends there at the
        # latest.
        while True:
            temp_array[current_index] = 1
            parent_index = parent(current_index)
            if temp_array[parent_index] == 1:
                break
            temp_array[parent_index] = 1
            temp_array[sibling(current_index)] = 0
            current_index = parent_index
    # Keep only the visited nodes (values 0 and 1), in array order
    output = temp_array[temp_array != -1]
    list1.append(output)
# The context manager closes the archive even if pickling fails part-way
with bz2.BZ2File(filename2, "wb") as bf:
    pickle.dump(list1, bf, 2)
end = time.perf_counter()
print("Time to compress:")
print(end - start)