import os
import sys
import json
import time
import traceback
import networkx as nx
import community as cm  # python-louvain
import collections
import math
import matplotlib.pyplot as plt
from log_binning import *  # local module, provides export_log_pl

# global co-authorship graph and lookup tables
G = nx.Graph()
affiliation_city = {}  # afid -> city name (lowercase), French affiliations only
auth_city = {}         # authid -> {city: number of papers written there}
city = {}              # authid -> most frequent city
cities = set()         # distinct cities over all authors


def debug(x, end='\n'):
    # lightweight progress logging on stderr
    sys.stderr.write(x)
    sys.stderr.write(end)


def extract_data_from_json(filename, domain):
    debug("extract_data_from_json('%s')" % filename)
    # open and parse the input json file
    filepath = os.getcwd() + '\\' + domain + '\\' + filename
    data = None
    try:
        with open(filepath) as f:
            data = json.loads(f.read())
    except Exception:
        print(filepath + " didn't work!")
        traceback.print_exc()
    return data
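
# Example usage (a sketch; "VETE 1990.json" is a hypothetical file name under
# the .\VETE folder, following the naming scheme assumed by extract_all below):
#   data = extract_data_from_json('VETE 1990.json', 'VETE')
#   papers = data['search-results']['entry']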


def extract_edges(papers):
    debug("extract_edges()")
    for paper in papers:
        if not paper.get('affiliation'):
            continue
        aff = paper['affiliation']
        # map every French affiliation id to its city
        if type(aff) is list:
            for a in aff:
                if (a.get('afid') and a.get('affiliation-city') and a.get('affiliation-country')
                        and a['affiliation-country'].lower() == 'france'):
                    affiliation_city[int(a['afid'])] = a['affiliation-city'].lower()
        else:
            if (aff.get('afid') and aff.get('affiliation-city') and aff.get('affiliation-country')
                    and aff['affiliation-country'].lower() == 'france'):
                affiliation_city[int(aff['afid'])] = aff['affiliation-city'].lower()
        # if that entry doesn't have an author ==> it's not a paper
        if not paper.get('author'):
            continue
        authors = paper['author']
        # a single author comes as a dict, not a list:
        # record their city, but there is no co-authorship edge to add
        if type(authors) is not list:
            if authors.get('afid') and type(authors.get('afid')) is not list:
                affid = int(authors['afid'])
                if affiliation_city.get(affid):
                    authid = int(authors['authid'])
                    authcity = affiliation_city[affid]
                    auth_city.setdefault(authid, {})
                    auth_city[authid][authcity] = auth_city[authid].get(authcity, 0) + 1
            continue
        for i in range(len(authors)):
            # count one more paper in this author's affiliation city
            if authors[i].get('afid') and type(authors[i].get('afid')) is not list:
                authid = int(authors[i]['authid'])
                affid = int(authors[i]['afid'])
                if affiliation_city.get(affid):
                    authcity = affiliation_city[affid]
                    auth_city.setdefault(authid, {})
                    auth_city[authid][authcity] = auth_city[authid].get(authcity, 0) + 1
            u = int(authors[i]['authid'])
            for j in range(i + 1):
                v = int(authors[j]['authid'])
                # since we are looping over the same array, we should skip this case
                if u == v:
                    continue
                # undirected graph ==> (u, v) == (v, u), so always key the edge
                # with the smaller id first; use fresh names so that u is not
                # clobbered for the remaining iterations of the inner loop
                a, b = min(u, v), max(u, v)
                if G.has_edge(a, b):
                    # update the edge weight
                    G[a][b]['weight'] += 1
                else:
                    G.add_edge(a, b, weight=1)
    # keep, for each author, the city they publish from most often
    for author in auth_city:
        city[author] = max(auth_city[author], key=auth_city[author].get)
    for c in city.values():
        cities.add(c)
    return G.number_of_nodes()
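
# A minimal sketch of the weighting scheme above, on made-up author ids:
# every unordered pair of co-authors shares one edge whose weight counts
# their common papers.
#   toy = nx.Graph()
#   for paper_authors in [[1, 2, 3], [1, 2]]:
#       for i in range(len(paper_authors)):
#           for j in range(i):
#               a, b = sorted((paper_authors[i], paper_authors[j]))
#               if toy.has_edge(a, b):
#                   toy[a][b]['weight'] += 1
#               else:
#                   toy.add_edge(a, b, weight=1)
#   # toy[1][2]['weight'] == 2, toy[1][3]['weight'] == 1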


def export_degree_hist(filename, domain):
    # degree sequence, sorted in decreasing order
    deg = sorted([d for n, d in G.degree()], reverse=True)
    out_dir = 'fig\\' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    slope = export_log_pl(deg, out_dir + '\\' + filename + '.png')
    degmax = deg[0]
    mean = sum(deg) / len(deg)
    return slope, degmax, mean
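
# Note: export_log_pl comes from the local log_binning module, which is not
# part of this paste; judging from its use here, it presumably log-bins the
# degree sequence, saves the distribution plot to the given .png path, and
# returns the fitted power-law slope.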


def export_evolution(data, filename, domain, metric):
    # one value per year, 1990..2018, so data must hold 29 entries
    years = list(range(1990, 2019))
    plt.title(metric + " evolution : " + domain)
    plt.ylabel(metric)
    plt.xlabel("Year")
    plt.plot(years, data, 'k-', lw=0.5)
    plt.plot(years, data, 'k+')
    plt.tight_layout()
    plt.savefig(filename + "_" + metric)
    plt.clf()
    plt.close()
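
# Example (a sketch, mirroring the call in extract_all below): plot how the
# fitted slope evolves for a domain, assuming slopes holds one value per year:
#   export_evolution(slopes, 'distributions\\VETE', 'VETE', 'Slope')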


def export_graphml(filename, domain):
    debug("export_graphml('%s')" % filename)
    # create a .graphml file to output the graph coded in graphml
    out_dir = 'graphml\\' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    output_file = open(out_dir + '\\' + filename, "w+")
    debug("---- file created : %s" % filename)
    # graphml format is structured as follows :
    # - xml header, including the <key> declarations (they must come before <graph>)
    # - nodes declarations
    # - edges declarations
    # - xml footer
    xml_header = "<?xml version='1.0' encoding='utf-8'?>"
    xml_header += '<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">'
    xml_header += '<key id="d1" for="edge" attr.name="weight" attr.type="int"/>'
    xml_header += '<graph edgedefault="undirected">\n'  # undirected graph
    output_file.write(xml_header)
    debug("---- xml_header : done.")
    # node ids declaration as graphml format : <node id="#node" />
    nodes = G.nodes
    for node in nodes:
        output_file.write('<node id="%d"/>\n' % node)
    debug("%d nodes added." % len(nodes))
    # edges declaration as graphml format : <edge source="src" target="tgt" />
    cnt = 1
    for e in G.edges.data(data='weight', default=1):
        output_file.write('<edge id="e%d" source="%d" target="%d">' % (cnt, e[0], e[1]))
        output_file.write('<data key="d1">%d</data>' % e[2])
        output_file.write('</edge>')
        cnt += 1
    # xml footer
    output_file.write('</graph></graphml>\n')
    debug("---- xml_footer : done.")
    debug("---- file exported successfully : %s\n" % filename)
    # close file now that we are done
    output_file.close()
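
# Note: networkx ships an equivalent writer, so the manual serialization
# above could be replaced with a single call:
#   nx.write_graphml(G, 'graphml\\' + domain + '\\' + filename)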


def extract_all(domain):
    folder = os.getcwd() + '\\' + domain
    slopes = []
    degmaxs = []
    means = []
    mods = []
    exes = []
    ps = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            # file names start with the 4-letter domain code, then the year
            domain = filename[:4]
            year = filename[5:9]
            data = extract_data_from_json(filename, domain)
            if data is None:
                debug(filename + " : data = None")
                continue
            papers = data['search-results']['entry']
            extract_edges(papers)
            # export_graphml("tranche " + filename[:9] + '.graphml', domain)
            s, d, m = export_degree_hist(filename[:9], domain)
            slopes.append(s)
            degmaxs.append(d)
            means.append(m)
            # p, mod, exe = export_stats(filename[:9], domain)
            # ps.append(p)
            # mods.append(mod)
            # exes.append(exe)
    folder = os.getcwd() + '\\distributions'
    if not os.path.exists(folder):
        os.makedirs(folder)
    export_evolution(slopes, folder + '\\' + domain, domain, 'Slope')
    # export_evolution(degmaxs, folder + '\\' + domain, domain, 'Degree Max')
    # export_evolution(means, folder + '\\' + domain, domain, 'Degree Mean')
    # export_evolution(mods, folder + '\\' + domain, domain, 'Modularity')
    # export_evolution(exes, folder + '\\' + domain, domain, 'Execution Time')
    # export_evolution(ps, folder + '\\' + domain, domain, '#Partitions')
    # export_graphml("whole_graph_" + domain + ".graphml", '')


def export_stats(filename, domain):
    ######################### Calculating Data ########################################################
    debug("Calculating data")
    nbr_nodes = G.number_of_nodes()
    nbr_edges = G.number_of_edges()
    debug("Connected Components")
    Gc = max(nx.connected_components(G), key=len)
    largest_cc = G.subgraph(Gc.copy())
    debug("Partitions")
    start = time.perf_counter()
    # Louvain community detection (python-louvain) on the largest component
    partition = cm.best_partition(largest_cc)
    end = time.perf_counter()
    execution_time = end - start
    nbr_partitions = max(partition.values()) + 1
    debug("Modularity")
    mod = cm.modularity(partition, largest_cc)
    ######################### Statistics Printing ########################################################
    if not os.path.exists('stats'):
        os.makedirs('stats')
    out_dir = 'stats\\' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # write the stats to a dedicated log file instead of hijacking sys.stdout
    with open(out_dir + '\\' + "statsFile_" + filename + ".log", "w+") as stats:
        print("Number of connected components: %d" % nx.number_connected_components(G), file=stats)
        print("Number of nodes : %d" % nbr_nodes, file=stats)
        print("Number of edges : %d" % nbr_edges, file=stats)
        print("Number of nodes in the largest connected component: %d" % largest_cc.number_of_nodes(), file=stats)
        print("Number of edges in the largest connected component: %d" % largest_cc.number_of_edges(), file=stats)
        print("Number of clusters: %d" % nbr_partitions, file=stats)
        print("modularity:", mod, file=stats)
        print("execution time : %0.2f seconds" % execution_time, file=stats)
    ######################### End of Statistics Printing ##################################################
    if not os.path.exists('csv'):
        os.makedirs('csv')
    out_dir = 'csv\\' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # one "authid,partition" line per node of the largest component
    with open(out_dir + '\\' + 'outputpartitionsFile_' + filename + '.csv', 'w+') as output:
        for key in partition.keys():
            output.write("%s,%s\n" % (key, partition[key]))
    return nbr_partitions, mod, execution_time
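
# Example (a sketch; the filename "VETE 1990" is hypothetical): once
# extract_edges has populated G, the per-year stats can feed export_evolution,
# as in the commented-out calls in extract_all:
#   p, mod, exe = export_stats('VETE 1990', 'VETE')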


def main():
    # domain = input()
    for file in os.scandir(os.getcwd()):
        if file.is_dir() and file.name == "VETE":
            domain = file.name
            extract_all(domain)
            G.clear()  # reset the graph before the next domain
    print(cities)
    print(len(cities))


if __name__ == '__main__':
    main()