import os
import sys
import json
import time
import traceback
import networkx as nx
import community as cm  # python-louvain
import collections
import math
import matplotlib.pyplot as plt
from log_binning import *  # local module, provides export_log_pl

# global co-authorship graph and lookup tables
G = nx.Graph()
affiliation_city = {}  # afid -> city name (lowercase), French affiliations only
auth_city = {}         # authid -> {city: number of papers written there}
city = {}              # authid -> most frequent city
cities = set()         # distinct cities over all authors


def debug(x, end='\n'):
    # lightweight progress logging on stderr
    sys.stderr.write(x)
    sys.stderr.write(end)


def extract_data_from_json(filename, domain):
    debug("extract_data_from_json('%s')" % filename)
    # open and parse the input json file
    filepath = os.getcwd() + '\\' + domain + '\\' + filename
    data = None
    try:
        with open(filepath) as f:
            data = json.loads(f.read())
    except Exception:
        print(filepath + " didn't work!")
        traceback.print_exc()
    return data
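
# Example usage (a sketch; "VETE 1990.json" is a hypothetical file name under
# the .\VETE folder, following the naming scheme assumed by extract_all below):
#   data = extract_data_from_json('VETE 1990.json', 'VETE')
#   papers = data['search-results']['entry']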


def extract_edges(papers):
    debug("extract_edges()")
    for paper in papers:
        if not paper.get('affiliation'):
            continue
        aff = paper['affiliation']
        # map every French affiliation id to its city
        if type(aff) is list:
            for a in aff:
                if (a.get('afid') and a.get('affiliation-city') and a.get('affiliation-country')
                        and a['affiliation-country'].lower() == 'france'):
                    affiliation_city[int(a['afid'])] = a['affiliation-city'].lower()
        else:
            if (aff.get('afid') and aff.get('affiliation-city') and aff.get('affiliation-country')
                    and aff['affiliation-country'].lower() == 'france'):
                affiliation_city[int(aff['afid'])] = aff['affiliation-city'].lower()
        # if that entry doesn't have an author ==> it's not a paper
        if not paper.get('author'):
            continue
        authors = paper['author']
        # a single author comes as a dict, not a list:
        # record their city, but there is no co-authorship edge to add
        if type(authors) is not list:
            if authors.get('afid') and type(authors.get('afid')) is not list:
                affid = int(authors['afid'])
                if affiliation_city.get(affid):
                    authid = int(authors['authid'])
                    authcity = affiliation_city[affid]
                    auth_city.setdefault(authid, {})
                    auth_city[authid][authcity] = auth_city[authid].get(authcity, 0) + 1
            continue
        for i in range(len(authors)):
            # count one more paper in this author's affiliation city
            if authors[i].get('afid') and type(authors[i].get('afid')) is not list:
                authid = int(authors[i]['authid'])
                affid = int(authors[i]['afid'])
                if affiliation_city.get(affid):
                    authcity = affiliation_city[affid]
                    auth_city.setdefault(authid, {})
                    auth_city[authid][authcity] = auth_city[authid].get(authcity, 0) + 1
            u = int(authors[i]['authid'])
            for j in range(i + 1):
                v = int(authors[j]['authid'])
                # since we are looping over the same array, we should skip this case
                if u == v:
                    continue
                # undirected graph ==> (u, v) == (v, u), so always key the edge
                # with the smaller id first; use fresh names so that u is not
                # clobbered for the remaining iterations of the inner loop
                a, b = min(u, v), max(u, v)
                if G.has_edge(a, b):
                    # update the edge weight
                    G[a][b]['weight'] += 1
                else:
                    G.add_edge(a, b, weight=1)
    # keep, for each author, the city they publish from most often
    for author in auth_city:
        city[author] = max(auth_city[author], key=auth_city[author].get)
    for c in city.values():
        cities.add(c)
    return G.number_of_nodes()
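
# A minimal sketch of the weighting scheme above, on made-up author ids:
# every unordered pair of co-authors shares one edge whose weight counts
# their common papers.
#   toy = nx.Graph()
#   for paper_authors in [[1, 2, 3], [1, 2]]:
#       for i in range(len(paper_authors)):
#           for j in range(i):
#               a, b = sorted((paper_authors[i], paper_authors[j]))
#               if toy.has_edge(a, b):
#                   toy[a][b]['weight'] += 1
#               else:
#                   toy.add_edge(a, b, weight=1)
#   # toy[1][2]['weight'] == 2, toy[1][3]['weight'] == 1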


def export_degree_hist(filename, domain):
    # degree sequence, sorted in decreasing order
    deg = sorted([d for n, d in G.degree()], reverse=True)
    out_dir = 'fig\\' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    slope = export_log_pl(deg, out_dir + '\\' + filename + '.png')
    degmax = deg[0]
    mean = sum(deg) / len(deg)
    return slope, degmax, mean
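
# Note: export_log_pl comes from the local log_binning module, which is not
# part of this paste; judging from its use here, it presumably log-bins the
# degree sequence, saves the distribution plot to the given .png path, and
# returns the fitted power-law slope.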


def export_evolution(data, filename, domain, metric):
    # one value per year, 1990..2018, so data must hold 29 entries
    years = list(range(1990, 2019))
    plt.title(metric + " evolution : " + domain)
    plt.ylabel(metric)
    plt.xlabel("Year")
    plt.plot(years, data, 'k-', lw=0.5)
    plt.plot(years, data, 'k+')
    plt.tight_layout()
    plt.savefig(filename + "_" + metric)
    plt.clf()
    plt.close()
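
# Example (a sketch, mirroring the call in extract_all below): plot how the
# fitted slope evolves for a domain, assuming slopes holds one value per year:
#   export_evolution(slopes, 'distributions\\VETE', 'VETE', 'Slope')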


def export_graphml(filename, domain):
    debug("export_graphml('%s')" % filename)
    # create a .graphml file to output the graph coded in graphml
    out_dir = 'graphml\\' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    output_file = open(out_dir + '\\' + filename, "w+")
    debug("---- file created : %s" % filename)
    # graphml format is structured as follows :
    # - xml header, including the <key> declarations (they must come before <graph>)
    # - nodes declarations
    # - edges declarations
    # - xml footer
    xml_header = "<?xml version='1.0' encoding='utf-8'?>"
    xml_header += '<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">'
    xml_header += '<key id="d1" for="edge" attr.name="weight" attr.type="int"/>'
    xml_header += '<graph edgedefault="undirected">\n'  # undirected graph
    output_file.write(xml_header)
    debug("---- xml_header : done.")
    # node ids declaration as graphml format : <node id="#node" />
    nodes = G.nodes
    for node in nodes:
        output_file.write('<node id="%d"/>\n' % node)
    debug("%d nodes added." % len(nodes))
    # edges declaration as graphml format : <edge source="src" target="tgt" />
    cnt = 1
    for e in G.edges.data(data='weight', default=1):
        output_file.write('<edge id="e%d" source="%d" target="%d">' % (cnt, e[0], e[1]))
        output_file.write('<data key="d1">%d</data>' % e[2])
        output_file.write('</edge>')
        cnt += 1
    # xml footer
    output_file.write('</graph></graphml>\n')
    debug("---- xml_footer : done.")
    debug("---- file exported successfully : %s\n" % filename)
    # close file now that we are done
    output_file.close()
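
# Note: networkx ships an equivalent writer, so the manual serialization
# above could be replaced with a single call:
#   nx.write_graphml(G, 'graphml\\' + domain + '\\' + filename)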


def extract_all(domain):
    folder = os.getcwd() + '\\' + domain
    slopes = []
    degmaxs = []
    means = []
    mods = []
    exes = []
    ps = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            # file names start with the 4-letter domain code, then the year
            domain = filename[:4]
            year = filename[5:9]
            data = extract_data_from_json(filename, domain)
            if data is None:
                debug(filename + " : data = None")
                continue
            papers = data['search-results']['entry']
            extract_edges(papers)
            # export_graphml("tranche " + filename[:9] + '.graphml', domain)
            s, d, m = export_degree_hist(filename[:9], domain)
            slopes.append(s)
            degmaxs.append(d)
            means.append(m)
            # p, mod, exe = export_stats(filename[:9], domain)
            # ps.append(p)
            # mods.append(mod)
            # exes.append(exe)
    folder = os.getcwd() + '\\distributions'
    if not os.path.exists(folder):
        os.makedirs(folder)
    export_evolution(slopes, folder + '\\' + domain, domain, 'Slope')
    # export_evolution(degmaxs, folder + '\\' + domain, domain, 'Degree Max')
    # export_evolution(means, folder + '\\' + domain, domain, 'Degree Mean')
    # export_evolution(mods, folder + '\\' + domain, domain, 'Modularity')
    # export_evolution(exes, folder + '\\' + domain, domain, 'Execution Time')
    # export_evolution(ps, folder + '\\' + domain, domain, '#Partitions')
    # export_graphml("whole_graph_" + domain + ".graphml", '')


def export_stats(filename, domain):
    ######################### Calculating Data ########################################################
    debug("Calculating data")
    nbr_nodes = G.number_of_nodes()
    nbr_edges = G.number_of_edges()
    debug("Connected Components")
    Gc = max(nx.connected_components(G), key=len)
    largest_cc = G.subgraph(Gc.copy())
    debug("Partitions")
    start = time.perf_counter()
    # Louvain community detection (python-louvain) on the largest component
    partition = cm.best_partition(largest_cc)
    end = time.perf_counter()
    execution_time = end - start
    nbr_partitions = max(partition.values()) + 1
    debug("Modularity")
    mod = cm.modularity(partition, largest_cc)
    ######################### Statistics Printing ########################################################
    if not os.path.exists('stats'):
        os.makedirs('stats')
    out_dir = 'stats\\' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # write the stats to a dedicated log file instead of hijacking sys.stdout
    with open(out_dir + '\\' + "statsFile_" + filename + ".log", "w+") as stats:
        print("Number of connected components: %d" % nx.number_connected_components(G), file=stats)
        print("Number of nodes : %d" % nbr_nodes, file=stats)
        print("Number of edges : %d" % nbr_edges, file=stats)
        print("Number of nodes in the largest connected component: %d" % largest_cc.number_of_nodes(), file=stats)
        print("Number of edges in the largest connected component: %d" % largest_cc.number_of_edges(), file=stats)
        print("Number of clusters: %d" % nbr_partitions, file=stats)
        print("modularity:", mod, file=stats)
        print("execution time : %0.2f seconds" % execution_time, file=stats)
    ######################### End of Statistics Printing ##################################################
    if not os.path.exists('csv'):
        os.makedirs('csv')
    out_dir = 'csv\\' + domain
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # one "authid,partition" line per node of the largest component
    with open(out_dir + '\\' + 'outputpartitionsFile_' + filename + '.csv', 'w+') as output:
        for key in partition.keys():
            output.write("%s,%s\n" % (key, partition[key]))
    return nbr_partitions, mod, execution_time
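
# Example (a sketch; the filename "VETE 1990" is hypothetical): once
# extract_edges has populated G, the per-year stats can feed export_evolution,
# as in the commented-out calls in extract_all:
#   p, mod, exe = export_stats('VETE 1990', 'VETE')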


def main():
    # domain = input()
    for file in os.scandir(os.getcwd()):
        if file.is_dir() and file.name == "VETE":
            domain = file.name
            extract_all(domain)
            G.clear()  # reset the graph before the next domain
    print(cities)
    print(len(cities))


if __name__ == '__main__':
    main()