Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from os import listdir
- from lxml import etree as ET
- from gexf import *
- from itertools import combinations
- #import xml.etree.ElementTree as ET #Use this if you don't have lxml installed
# Bookkeeping for the parsing stage: a running count of the records read and
# one {author: institution} dictionary per record.
records, therecords = 0, []

# Create the GEXF document with a single undirected, static graph, and declare
# the two string-valued node attributes ("University" and "Institution") that
# are attached to author nodes later in the script.
gexf = Gexf("Author-Institution network", "GU")
graph = gexf.addGraph("undirected", "static", "Swepub network")
attribute_node = graph.addNodeAttribute(
    "University", "default_value", "string"
)
attribute_nodetwo = graph.addNodeAttribute(
    "Institution", "default_value", "string"
)
# Build one {author name: affiliation} dictionary per record found in the XML
# files of the download directory, and append it to `therecords`.
#
# NOTE(review): the pasted source had lost its indentation, so the nesting
# below is a reconstruction. It assumes an author is recorded only when the
# same datafield supplies both a name (code "a") and an affiliation
# (code "u") -- confirm against the original script.
data_dir = "GUYear2015N47761Searched20160110/"

# Sorted so that the run is reproducible: listdir() returns names in arbitrary
# order, and both the de-duplication (first occurrence wins) and the edge ids
# assigned later depend on the order in which records are read.
for filename in sorted(listdir(data_dir)):
    # Binary mode lets the XML parser apply the encoding declared in the file
    # instead of whatever the platform's default text encoding happens to be.
    with open(data_dir + filename, "rb") as currentFile:
        tree = ET.parse(currentFile)
    root = tree.getroot()

    for child in root[0]:  # each child of the first element is one record
        records += 1
        coauthors = {}
        for c in child:
            # Tag "100" holds the first author, tag "700" the other authors;
            # every other datafield is irrelevant here.
            if c.get("tag") not in ("100", "700"):
                continue

            # Reset per datafield so that a field lacking a name or an
            # affiliation can never inherit one from the previous field.
            author = None
            institution = None
            for value in c:
                code = value.get("code")
                if code == "a":  # author name
                    author = value.text
                elif code == "u":  # affiliation
                    institution = value.text

            # Empty subfields have text None; skipping them keeps None out of
            # the dictionary, where it would later break the string handling.
            if author is not None and institution is not None:
                coauthors[author] = institution
        therecords.append(coauthors)
# Drop records whose {author: institution} mapping is identical to one that
# was already seen, keeping the first occurrence.
# Caveat: two different articles written by exactly the same group of authors
# (with the same affiliations) cannot be told apart here and are collapsed
# into a single record.
seen = set()
therecordsdeduplicated = []
for d in therecords:
    # A frozenset of the items is independent of insertion order, so equal
    # dictionaries whose authors were read in a different order are still
    # recognised as duplicates (a tuple of the items would not be).
    t = frozenset(d.items())
    if t not in seen:
        seen.add(t)
        therecordsdeduplicated.append(d)
# For every co-authored record: add one node per author (tagged with the parts
# of the affiliation string) and collect every unordered pair of its authors.
# `edges` ends up holding one list of (author, author) tuples per record.
edges = []
coauthorcounter = 0  # number of records with at least two authors
for t in therecordsdeduplicated:
    if len(t) < 2:
        # Single-author (or empty) records produce neither nodes nor edges.
        continue
    coauthorcounter += 1

    # All pairs of this record's authors, computed once per record. Doing this
    # inside the per-author loop below would append every pair once per
    # author, multiplying each edge by the number of authors.
    edgelist = list(combinations(t, 2))

    for key, value in t.items():
        # The affiliation is split on ", ": the first part is stored as the
        # "University" attribute and the second, when present, as the
        # "Institution" attribute. A missing or empty affiliation gives no
        # parts at all instead of raising on None.
        newvalue = value.split(', ') if value else []
        n = graph.addNode(key, key)

        if len(newvalue) > 0:
            print(newvalue[0])
            n.addAttribute(attribute_node, newvalue[0])
        else:
            print("nothing here...")

        if len(newvalue) > 1:
            print(newvalue[1])
            n.addAttribute(attribute_nodetwo, newvalue[1])
        else:
            print("nothing here...")

    # Pairs that recur in other records are kept as separate entries.
    edges.append(edgelist)
# Flatten the per-record pair lists into one sequence, then add one edge per
# pair; the pair's position in the flattened sequence is used as the edge id.
authoredges = [pair for record_pairs in edges for pair in record_pairs]
for edge_id, pair in enumerate(authoredges):
    graph.addEdge(edge_id, pair[0], pair[1])
# Print some meta-data about the run.
print("There are " + str(records) + " records")
print("There are " + str(coauthorcounter) + " co-authored articles (more than 1 author)\n")

# Write the graph to disk. The context manager guarantees the file is flushed
# and closed even if serialisation fails part-way through.
with open("coauthors.gexf", "wb") as gexf_file:
    gexf.write(gexf_file)

# Disabled code, kept for reference (it was previously parked in a bare
# triple-quoted string): adds the text of subfield "a" of every tag "245"
# datafield as a node, printing it with a "TITLE: " prefix.
# NOTE(review): it reads `root`, which at this point refers only to the last
# file parsed -- it would have to move into the parsing loop to cover all
# files.
#
# for child in root[0]:
#     for c in child:
#         if c.get("tag") == "245":
#             authorlist = []
#             for value in c:
#                 authordict = {}
#                 if value.get("code") == "a":
#                     #print(value.text)
#                     title = value.text
#                     print("TITLE: " + title)
#                     graph.addNode(title, title)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement