# Untitled

from os import listdir
from lxml import etree as ET
from gexf import *
from itertools import combinations
#import xml.etree.ElementTree as ET #Use this if you don't have lxml installed

# Create the GEXF document object. It is only written out by gexf.write()
# at the end of the script.
# NOTE(review): the two Gexf() arguments are presumably (creator, description)
# -- confirm against the gexf library.
gexf = Gexf("Author-Institution network", "GU")
# The single graph that all author nodes and co-author edges are added to.
graph = gexf.addGraph("undirected", "static", "Swepub network")
# Node attributes filled in later from the comma-separated institution string:
# its first part goes into "University", its second part into "Institution".
# NOTE(review): arguments look like (title, default value, type) -- confirm.
attribute_node = graph.addNodeAttribute("University", "default_value", "string")
attribute_nodetwo = graph.addNodeAttribute("Institution", "default_value", "string")

records = 0  # Number of records seen across all files (control counter)
therecords = []  # One {author name: institution string} dict per record

# Directory holding the XML files. The trailing slash is required because
# file paths are built by plain string concatenation.
data_dir = "GUYear2015N47761Searched20160110/"

# Field tags that describe a person: 100 is the first author, 700 the others.
author_tags = ("100", "700")

# Extract author -> institution pairs from every record of every XML file.
# The listing is sorted because listdir() returns names in arbitrary order,
# and the record order decides which duplicate is kept and which ids the
# edges get further down.
for filename in sorted(listdir(data_dir)):
    # Binary mode lets the XML parser honour the file's own encoding
    # declaration instead of the platform's default text encoding.
    with open(data_dir + filename, "rb") as currentFile:
        root = ET.parse(currentFile).getroot()

    # NOTE(review): assumes the records are the children of the root's first
    # child element -- confirm against the export format.
    for child in root[0]:
        records += 1
        coauthors = {}
        for c in child:  # Fields of one record
            if c.get("tag") not in author_tags:
                continue
            # Reset for every field so that an institution can never be
            # attached to the author left over from a previous field.
            author = None
            for value in c:
                code = value.get("code")
                if code == "a":  # a is author name
                    author = value.text
                elif code == "u" and author is not None:  # u is institution
                    # Authors are only recorded once an institution is found;
                    # a later "u" in the same field overwrites an earlier one.
                    coauthors[author] = value.text

        therecords.append(coauthors)  # Keep one dictionary per record

# Drop every record whose author -> institution mapping equals one that has
# already been seen.
# Possible false positive: two different articles written by exactly the same
# authors at the same institutions cannot be told apart here and collapse
# into a single record.
seen = set()
therecordsdeduplicated = []
for d in therecords:
    # A frozenset of the items is hashable and, like dict equality, does not
    # depend on the order in which the authors were inserted.
    t = frozenset(d.items())
    if t not in seen:
        seen.add(t)
        therecordsdeduplicated.append(d)

# Create one node per author and collect the co-author pairs of each record.
edges = []  # One list of (author, author) pairs per co-authored record
coauthorcounter = 0  # Number of records with at least two authors
for t in therecordsdeduplicated:
    if len(t) < 2:
        continue  # Single-author (or empty) records produce no edges
    coauthorcounter += 1

    for key, value in t.items():
        # The institution string is split on ", "; the first part is stored
        # as "University" and the second as "Institution". An empty "u"
        # subfield yields None, which is treated as "no parts".
        newvalue = value.split(', ') if value is not None else []
        n = graph.addNode(key, key)

        if newvalue:
            print(newvalue[0])
            n.addAttribute(attribute_node, newvalue[0])
        else:
            print("nothing here...")

        if len(newvalue) > 1:
            print(newvalue[1])
            n.addAttribute(attribute_nodetwo, newvalue[1])
        else:
            print("nothing here...")

    # Every unordered pair of authors of this record. Computed once per
    # record: building it inside the per-author loop would emit each pair
    # len(t) times and create duplicate parallel edges.
    edges.append(list(combinations(t, 2)))

# Flatten the per-record pair lists into one list, then add every pair to the
# graph as an edge whose id is its position in that flat list.
authoredges = [pair for record_pairs in edges for pair in record_pairs]

for edge_id, pair in enumerate(authoredges):
    graph.addEdge(edge_id, pair[0], pair[1])


# Print some meta-data
print("There are " + str(records) + " records")
print("There are " + str(coauthorcounter) + " co-authored articles (more than 1 author)\n")

# Write the graph to disk. The with-block guarantees the file is flushed and
# closed even if gexf.write() raises.
with open("coauthors.gexf", "wb") as gexf_file:
    gexf.write(gexf_file)


"""
        for child in root[0]:
            for c in child:
                if c.get("tag") == "245":
                    authorlist = []
                    for value in c:
                        authordict = {}

                        if value.get("code") == "a":
                            #print(value.text)
                            title = value.text
                            print("TITLE: " + title)
                            graph.addNode(title, title)
"""