# Untitled -- script that downloads the graph datasets listed in GRAPH_URLS into ./input.

from urllib.parse import urlparse

import os
import wget
import gzip
import shutil
import zipfile


# Gzipped edge-list archives fetched by this script, all hosted on
# snap.stanford.edu. Every entry uses HTTPS so that no dataset is pulled over
# plain HTTP and the list is consistent.
GRAPH_URLS = [
    'https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz',
    'https://snap.stanford.edu/data/amazon0302.txt.gz',
    'https://snap.stanford.edu/data/roadNet-PA.txt.gz',
    'https://snap.stanford.edu/data/cit-HepPh.txt.gz',
    'https://snap.stanford.edu/data/amazon0505.txt.gz',
    'https://snap.stanford.edu/data/roadNet-CA.txt.gz',
]

# Every single-digit multiple of 10**0 .. 10**3, in ascending order:
# 1..9, 10..90, 100..900, 1000..9000 (36 values in total).
# NOTE(review): not referenced in the visible part of this file; presumably
# sizes for generated full graphs -- confirm against the consumer.
FULLGRAPH_POWS = [
    digit * 10 ** exponent
    for exponent in range(4)
    for digit in range(1, 10)
]


def donwload_graph(url):
    """Download a gzipped edge list and store it, comment-free, in ./input.

    The output file name is the last path component of *url* with its final
    extension (``.gz``) removed. If that file already exists nothing is
    downloaded. Otherwise the archive is fetched with ``wget``, decompressed,
    lines starting with ``#`` are dropped, tabs are turned into single
    spaces, and the archive is deleted.

    Args:
        url: Address of a ``.gz`` archive containing a text edge list.

    Returns:
        None.

    Raises:
        OSError: If the archive cannot be read or the output cannot be
            written. Errors raised by ``wget.download`` propagate unchanged.
    """
    input_dir = './input'
    archive_name = os.path.split(urlparse(url).path)[1]
    file_path = os.path.splitext(input_dir + '/' + archive_name)[0]

    # The cleaned file doubles as the "already done" marker.
    if os.path.exists(file_path):
        return

    # wget.download() only treats `out` as a directory when it already
    # exists; otherwise it would save the archive as a file named "input".
    os.makedirs(input_dir, exist_ok=True)

    # Use the path wget reports: it picks a different name when a stale
    # archive with the expected name is already on disk.
    archive_path = wget.download(url, input_dir)

    # Write to a side file and move it into place only once complete, so an
    # interrupted run never leaves a truncated file that later runs would skip.
    partial_path = file_path + '.part'
    try:
        with gzip.open(archive_path, 'rt') as f_in, \
                open(partial_path, 'w') as f_out:
            # Stream line by line instead of loading the whole graph in memory.
            # NOTE(review): the previous code replaced ' ' with ' ' (a no-op);
            # a tab -> space normalisation was presumably intended -- confirm.
            f_out.writelines(
                line.replace('\t', ' ')
                for line in f_in
                if not line.startswith('#')
            )
        os.replace(partial_path, file_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        if os.path.exists(archive_path):
            os.remove(archive_path)


if __name__ == '__main__':
    # Script entry point: process every configured dataset in list order.
    # donwload_graph() itself skips datasets whose output file already exists.
    for graph_url in GRAPH_URLS:
        donwload_graph(graph_url)