Advertisement
Guest User

Untitled

a guest
May 4th, 2016
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.82 KB | None | 0 0
  # Build an author co-authorship network from exported XML records and save it as GEXF.
#
# Every file in RECORDS_DIR is parsed; for each record the author fields
# (tag 100 = first author, tag 700 = further authors) are read, each author
# becomes a node carrying University/Institution attributes, and every pair
# of authors on the same record becomes one edge. The graph is written to
# "coauthors.gexf" in the current working directory.
from os import listdir
from lxml import etree as ET
from gexf import *
from itertools import combinations
#import xml.etree.ElementTree as ET #Use this if you don't have lxml installed

# Directory holding the XML files to read (every entry in it is parsed).
RECORDS_DIR = "GUYear2015N47761Searched20160110/"

# Field tags that carry an author: 100 is the first author, 700 the others.
AUTHOR_TAGS = ("100", "700")


def extract_coauthors(record):
    """Return a {author name: institution} dict for one record element.

    Only fields whose "tag" attribute is in AUTHOR_TAGS are inspected. Inside
    such a field, the subfield with code "a" is the author name and the
    subfield with code "u" is the institution. An author is recorded only
    when an institution subfield follows the name within the same field; if
    a field has several "u" subfields the last one wins.
    """
    coauthors = {}
    for field in record:
        if field.get("tag") not in AUTHOR_TAGS:
            continue
        # Reset per field so an institution can never be attached to an
        # author name left over from an earlier field or an earlier record.
        author = None
        for subfield in field:
            code = subfield.get("code")
            if code == "a":
                author = subfield.text
            elif code == "u" and author is not None:
                coauthors[author] = subfield.text
    return coauthors


# Open up a gexf file
gexf = Gexf("Author-Institution network", "GU")
graph = gexf.addGraph("undirected", "static", "Swepub network")
attribute_node = graph.addNodeAttribute("University", "default_value", "string")
attribute_nodetwo = graph.addNodeAttribute("Institution", "default_value", "string")

records = 0  # Just a counter for control
therecords = []  # One {author: institution} dict per record

# Extract author/institution from the xml-files. The listing is sorted so
# that node and edge order (and therefore edge ids) are reproducible.
for filename in sorted(listdir(RECORDS_DIR)):
    # Binary mode lets the XML parser apply the encoding declared in the
    # file instead of decoding with the locale default first.
    with open(RECORDS_DIR + filename, "rb") as currentFile:
        tree = ET.parse(currentFile)
    root = tree.getroot()
    if len(root) == 0:
        # Nothing below the root element, so there are no records to read.
        continue
    for child in root[0]:  # The children of the root's first child are the records
        records += 1
        therecords.append(extract_coauthors(child))

# Remove duplicate dictionaries from therecords. A frozenset of the items is
# used as the fingerprint so that two equal dictionaries match regardless of
# the order in which their authors were inserted.
# Caveat: two different articles with exactly the same authors and
# institutions are indistinguishable here and collapse into one.
seen = set()
therecordsdeduplicated = []
for d in therecords:
    fingerprint = frozenset(d.items())
    if fingerprint not in seen:
        seen.add(fingerprint)
        therecordsdeduplicated.append(d)

# Create the author nodes and collect, per article, the author pairs that
# will become edges.
edges = []
coauthorcounter = 0  # Number of articles with at least two authors
for t in therecordsdeduplicated:
    if len(t) < 2:
        # Single-author (or author-less) articles contribute no edges.
        continue
    coauthorcounter += 1

    for key, value in t.items():
        # The institution string is split on ", "; the first part is stored
        # as University and the second as Institution. A subfield without
        # text yields no parts at all.
        newvalue = value.split(', ') if value is not None else []
        n = graph.addNode(key, key)
        for attribute, index in ((attribute_node, 0), (attribute_nodetwo, 1)):
            if index < len(newvalue):
                print(newvalue[index])
                n.addAttribute(attribute, newvalue[index])
            else:
                print("nothing here...")

    # All possible author pairs of this article, computed once per article
    # so that every pair yields exactly one edge.
    edges.append(list(combinations(t, 2)))

# Flatten the per-article pair lists and add one edge per pair, using the
# running index as the edge id.
authoredges = [pair for edgelist in edges for pair in edgelist]
for enumer, e in enumerate(authoredges):
    graph.addEdge(enumer, e[0], e[1])

# Print some meta-data
print("There are " + str(records) + " records")
print("There are " + str(coauthorcounter) + " co-authored articles (more than 1 author)\n")

# Write file; the with-block guarantees the output is flushed and closed.
with open("coauthors.gexf", "wb") as gexf_file:
    gexf.write(gexf_file)


# NOTE: unused alternative kept from the original paste. It adds the article
# titles (tag 245, code "a") as nodes instead of the authors:
#
#        for child in root[0]:
#            for c in child:
#                if c.get("tag") == "245":
#                    authorlist = []
#                    for value in c:
#                        authordict = {}
#
#                        if value.get("code") == "a":
#                            #print(value.text)
#                            title = value.text
#                            print("TITLE: " + title)
#                            graph.addNode(title, title)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement