Advertisement
Guest User

Untitled

a guest
Dec 8th, 2019
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.47 KB | None | 0 0
  1. import os
  2. import sys
  3. import json
  4. import time
  5. import traceback
  6. import networkx as nx
  7. import community as cm
  8. import collections
  9. import math
  10. import matplotlib.pyplot as plt
  11. from log_binning import *
  12.  
# Co-authorship graph shared by every function below: nodes are author ids and
# each edge carries a 'weight' attribute that extract_edges() increments.
G = nx.Graph()
# affiliation id (int) -> lower-cased city, filled only for affiliations whose
# country is 'france' (see extract_edges)
affiliation_city = {}
# author id (int) -> {city: number of times the author was seen with that city}
auth_city = {}
# author id (int) -> the city with the highest count in auth_city
city={}
# every city assigned to at least one author in `city`
cities= set()
  18.  
  19.  
  20. def debug(x, end = '\n'):
  21.     sys.stderr.write(x)
  22.     sys.stderr.write(end)
  23.  
  24.  
  25. def extract_data_from_json(filename, domain):
  26.     debug("extract_data_from_json('%s')" % (filename))
  27.     # open input json file
  28.     filepath = os.getcwd() + '\\' + domain + '\\' + filename
  29.     f = open(filepath)
  30.  
  31.     data = None
  32.     # parse json file
  33.     try:
  34.         data = json.loads(f.read())
  35.     except Exception as e:
  36.         print(filepath + " didn't work !")
  37.         traceback.print_exc()
  38.         f.close()
  39.  
  40.     f.close()
  41.     return data
  42.  
  43. def extract_edges(papers):
  44.     debug("extract_edges()")
  45.  
  46.     for paper in papers:
  47.         if (not paper.get('affiliation')):
  48.             continue
  49.         aff = paper['affiliation']
  50.         if (type(aff) is list):
  51.             for a in aff:
  52.                 if(a.get('afid') and a.get('affiliation-city') and a.get('affiliation-country') and a['affiliation-country'].lower()=='france'):
  53.                     affid = int(a['afid'])
  54.                     affiliation_city[affid] = a['affiliation-city'].lower()
  55.         else:
  56.             if(aff.get('afid') and aff.get('affiliation-city') and aff.get('affiliation-country').lower()=='france'):
  57.                 affid = int(aff['afid'])
  58.                 affiliation_city[affid] = aff['affiliation-city'].lower()
  59.         # if that entry doesn't have an author ==> it's not a paper
  60.         if (not paper.get('author')):
  61.             affid = int(authors['afid'])
  62.             authid = int(authors['authid'])
  63.             authcity = affiliation_city[affid]
  64.             if (auth_city.get(authid)):
  65.                 if (auth_city[authid].get(authcity)):
  66.                     auth_city[authid][authcity] += 1
  67.                 else:
  68.                     auth_city[authid][authcity] = 1
  69.             else:
  70.                 auth_city[authid] = {}
  71.                 auth_city[authid][authcity] = 1
  72.             continue
  73.  
  74.         authors = paper['author']
  75.  
  76.         # if there is only 1 author, ignore this paper
  77.         if (not type(authors) is list):
  78.             continue
  79.         else:
  80.             for i in range(len(authors)):
  81.                 if(authors[i].get('afid') and type(authors[i].get('afid')) is not list ):
  82.                     authid = int(authors[i]['authid'])
  83.  
  84.                     affid = int(authors[i]['afid'])
  85.                     if(affiliation_city.get(affid)):
  86.                         authcity = affiliation_city[affid]
  87.                         if (auth_city.get(authid)):
  88.                             if (auth_city[authid].get(authcity)):
  89.                                 auth_city[authid][authcity] += 1
  90.                             else:
  91.                                 auth_city[authid][authcity] = 1
  92.                         else:
  93.                             auth_city[authid] = {}
  94.                             auth_city[authid][authcity] = 1
  95.                 u = int(authors[i]['authid'])
  96.  
  97.                 for j in range(i + 1):
  98.                     v = int(authors[j]['authid'])
  99.  
  100.                     # since we are looping over the same array, we should skip this case
  101.                     if (u == v):
  102.                         continue
  103.  
  104.                     # undirected graph ==> (u,v) == (v, u)
  105.                     # to store only 1/2 of the memory
  106.                     # we will always use key (u,v) such as u is smaller than v
  107.                     u, v = min(u, v), max(u, v)
  108.  
  109.                     # checking if edge already exists
  110.                     if G.has_edge(u, v):
  111.                         # update the edge weight
  112.                         G[u][v]['weight'] += 1
  113.                     else:
  114.                         G.add_edge(u, v, weight = 1)
  115.     for author in auth_city:
  116.         city[author] = max(auth_city[author], key=auth_city[author].get)
  117.     for author,c in city.items():
  118.         cities.add(c)
  119.  
  120.     return G.number_of_nodes
  121.  
  122. def export_degree_hist(filename, domain):
  123.     deg = sorted([d for n, d in G.degree()], reverse=True)
  124.     # tot = sum(deg)
  125.     # min_deg = min(deg)
  126.     # max_deg = max(deg)
  127.     # deg=[(d - min_deg) / (max_deg - min_deg) for d in deg]
  128.     # tot = sum([d for n, d in G.degree()])
  129.     # degree_sequence = sorted([int(d * 100.0/tot) for n, d in G.degree()], reverse=True)
  130.     # degreeCount = collections.Counter(degree_sequence)
  131.     #
  132.     # deg, cnt = zip(*degreeCount.items())
  133.     # deg=deg[:-1]
  134.     # cnt=cnt[:-1]
  135.     # fig, ax = plt.subplots()
  136.     # # plt.bar(deg, cnt, width=0.80, color='b')
  137.     # #
  138.     # # plt.title("Degree Histogram : " + filename)
  139.     # # plt.ylabel("Count")
  140.     # # plt.xlabel("Degree")
  141.     # # ax.set_xticks([d + 0.4 for d in deg])
  142.     # # ax.set_xticklabels(deg)
  143.     dir = 'fig\\' + domain
  144.  
  145.  
  146.     if not os.path.exists(dir):
  147.         os.makedirs(dir)
  148.  
  149.     slope = export_log_pl(deg,dir + '\\' + filename + '.png')
  150.     degmax = max([d for n, d in G.degree()])
  151.     mean = sum([d for n, d in G.degree()]) / len([d for n, d in G.degree()])
  152.  
  153.     return slope, degmax, mean
  154.  
  155. def export_evolution(data,filename,domain,metric):
  156.     y = [x for x in range(1990, 2019)]
  157.     plt.title(metric + " distribution : " + domain)
  158.     plt.ylabel("Value")
  159.     plt.xlabel(metric)
  160.  
  161.     # if(metric == 'Slope'):
  162.     #     print(data)
  163.  
  164.     plt.plot(y, data, 'k-', lw=0.5)
  165.     plt.plot(y, data, 'k+')
  166.     plt.tight_layout()
  167.     plt.savefig(filename+"_"+metric)
  168.     plt.clf()
  169.     plt.close()
  170.  
  171.  
  172.  
  173. def export_graphml(filename, domain):
  174.     debug("export_graphml('%s')" % filename)
  175.  
  176.  
  177.     # create a file .graphml to output the graph coded in graphml
  178.     dir = 'graphml\\' + domain
  179.  
  180.     if not os.path.exists(dir):
  181.         os.makedirs(dir)
  182.  
  183.     output_file = open(dir + '\\' + filename, "w+")
  184.  
  185.     debug("---- file created : %s" % filename)
  186.     sys.stdout = output_file
  187.  
  188.     # graphml format is structured as follows :
  189.     #     - xml_header
  190.     #     - nodes declarations
  191.     #     - edges declarations
  192.     #     - xml_footer
  193.  
  194.     xml_header = "<?xml version='1.0' encoding='utf-8'?>"
  195.     xml_header += '<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">'
  196.     xml_header += '<graph edgedefault="undirected">\n'  # undirected graph
  197.  
  198.     sys.stdout.write(xml_header)
  199.  
  200.     debug("---- xml_header : done.")
  201.  
  202.     # res += xml_header
  203.  
  204.     sys.stdout.write('<key id="d1" for="edge" attr.name="weight" attr.type="int"/>')
  205.  
  206.     # node ids declaration as graphml format : <node id="#node" />
  207.     nodes = G.nodes
  208.     for node in nodes:
  209.         sys.stdout.write('<node id="%d"/>\n' % (node))
  210.  
  211.     debug("%d nodes added." % len(nodes))
  212.     # edges declaration as graphml format : <edge source="src" target="tgt" />
  213.  
  214.     cnt = 1
  215.     for e in G.edges.data(data='weight', default=1):
  216.         sys.stdout.write('<edge id="e%d" source="%d" target="%d">' % (cnt, e[0], e[1]))
  217.         sys.stdout.write('<data key="d1">%d</data>' % e[2])
  218.         sys.stdout.write('</edge>')
  219.         cnt += 1
  220.  
  221.     # xml_footerx
  222.     sys.stdout.write('</graph></graphml>\n')
  223.  
  224.     debug("---- xml_footer : done.")
  225.     debug("---- file exported successfully : %s\n" % filename)
  226.  
  227.     # close file now that we are done
  228.     output_file.close()
  229.  
  230.  
  231. def extract_all(domain):
  232.     folder = os.getcwd() + '\\' + domain
  233.  
  234.     slopes = []
  235.     degmaxs = []
  236.     means = []
  237.     mods = []
  238.     exes = []
  239.     ps = []
  240.  
  241.     for filename in os.listdir(folder):
  242.  
  243.         if filename.endswith('.json'):
  244.             domain = filename[:4]
  245.             year = filename[5:9]
  246.  
  247.             data = extract_data_from_json(filename, domain)
  248.             if(data == None):
  249.                 debug(filename + " : data = None")
  250.                 continue
  251.             papers = data['search-results']['entry']
  252.             extract_edges(papers)
  253.             # export_graphml("tranche " + filename[:9] + '.graphml', domain)
  254.             s, d, m = export_degree_hist(filename[:9], domain)
  255.             slopes.append(s)
  256.             degmaxs.append(d)
  257.             means.append(m)
  258.             # partition= export_stats(filename[:9], domain)
  259.             # ps.append(p)
  260.             # mods.append(mod)
  261.             # exes.append(exe)
  262.     folder = os.getcwd() + '\\distributions'
  263.  
  264.     if not os.path.exists(folder):
  265.         os.makedirs(folder)
  266.  
  267.  
  268.     export_evolution(slopes, folder + '\\' + domain,domain,'Slope')
  269.     # export_evolution(degmaxs, folder + '\\' + domain,domain,'Degree Max')
  270.     # export_evolution(means, folder + '\\' + domain,domain,'Degree Mean')
  271.     # export_evolution(mods, folder + '\\' + domain, domain, 'Modularity')
  272.     # export_evolution(exes, folder + '\\' + domain, domain, 'Execution Time')
  273.     # export_evolution(ps, folder + '\\' + domain, domain, '#Partitions')
  274.  
  275.  
  276.  
  277.     # export_graphml("whole_graph_" + domain + ".graphml", '')
  278.  
  279. def export_stats(filename, domain):
  280.     ######################### Calculating Data ########################################################
  281.  
  282.     debug("Calculating data")
  283.     nbr_nodes = G.number_of_nodes()
  284.     nbr_edges = G.number_of_edges()
  285.  
  286.     debug("Connected Componnents")
  287.     Gc = max(nx.connected_components(G), key=len)
  288.     largest_cc = G.subgraph(Gc.copy())
  289.  
  290.     debug("Partitions")
  291.  
  292.     start = time.perf_counter()
  293.  
  294.     partition = cm.best_partition(largest_cc)
  295.     # print(partition)
  296.     end = time.perf_counter()
  297.     execution_time = end - start
  298.  
  299.     nbr_partitions = max([v for u, v in partition.items()]) + 1
  300.  
  301.     debug("Modularity")
  302.     mod = cm.modularity(partition, largest_cc)
  303.     return partition
  304.  
  305.     ######################### Statistics Printing ########################################################
  306.  
  307.     if not os.path.exists('stats'):
  308.         os.makedirs('stats')
  309.  
  310.     dir = 'stats\\' + domain
  311.  
  312.     if not os.path.exists(dir):
  313.         os.makedirs(dir)
  314.  
  315.     sys.stdout = open(dir + '\\' + "statsFile_" + filename + ".log", "w+")
  316.  
  317.     print("Number of connected components: %d" % (nx.number_connected_components(G)))
  318.     print("Number of nodes : %d" % (nbr_nodes))
  319.     print("Number of edges : %d" % (nbr_edges))
  320.     print("Number of nodes in the largest connected components: %d" % (largest_cc.number_of_nodes()))
  321.     print("Number of edges in the largest connected components: %d" % (largest_cc.number_of_edges()))
  322.     print("Number of clusters: %d" % (nbr_partitions))
  323.     print("modularity:", mod)
  324.     print("execution time : %0.2f seconds" % (execution_time))
  325.  
  326.  
  327.     ######################### End of Statistics Priting ##################################################
  328.  
  329.     if not os.path.exists('csv'):
  330.         os.makedirs('csv')
  331.  
  332.     dir = 'csv\\' + domain
  333.  
  334.     if not os.path.exists(dir):
  335.         os.makedirs(dir)
  336.  
  337.     with open(dir + '\\' + 'outputpartitionsFile_' + filename + '.csv', 'w+') as output:
  338.         for key in partition.keys():
  339.             output.write("%s,%s\n" % (key, partition[key]))
  340.  
  341.     return nbr_partitions,mod,execution_time
  342.  
  343.  
  344.  
  345.  
  346. def main():
  347.     # domain = input()
  348.  
  349.     for file in os.scandir(os.getcwd()):
  350.         if(file.is_dir() and file.name=="VETE"):
  351.             domain = file.name
  352.             extract_all(domain)
  353.             G.clear()
  354.     # print(cities)
  355.     print(cities)
  356.     print(len(cities))
  357.  
  358.  
  359.  
  360.  
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':

    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement