Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sqlite3
- import sys
- import urllib
- import datetime
- import os
- import pickle
- import itertools
def create_table(nodes=None, edges=None, db_file_name="/tmp/example.sqlite"):
    """Write a node/edge graph into a freshly created SQLite database.

    Any existing file at ``db_file_name`` is removed first, then two untyped
    tables are created: ``nodes(id, label, x, y, size)`` and
    ``edges(source, target, label, weight)``.

    Args:
        nodes: Mapping of node label -> node id. ``None`` means no nodes.
        edges: Mapping of source node id -> target node id. ``None`` means
            no edges.
        db_file_name: Path of the SQLite file to (re)create. Defaults to the
            location the script has always used.
    """
    nodes = {} if nodes is None else nodes
    edges = {} if edges is None else edges

    # Start from a clean file so rows from a previous run never leak in.
    if os.path.isfile(db_file_name):
        os.remove(db_file_name)

    db = sqlite3.connect(db_file_name)
    try:
        db.text_factory = str
        cur = db.cursor()
        try:
            cur.executescript("""
                create table nodes(
                    id,
                    label,
                    x,
                    y,
                    size
                );
                create table edges(
                    source,
                    target,
                    label,
                    weight
                );
            """)
            # The enumeration index is stored as both x and y; every node
            # gets size 1.
            cur.executemany(
                'insert into nodes values (?,?,?,?,?)',
                [
                    (str(node_id), str(label), str(index), str(index), 1)
                    for index, (label, node_id) in enumerate(nodes.items())
                ],
            )
            # Edges carry an empty label and a constant weight of '1'.
            cur.executemany(
                'insert into edges values (?,?,?,?)',
                [
                    (str(source), str(target), '', '1')
                    for source, target in edges.items()
                ],
            )
            db.commit()
        finally:
            cur.close()
    finally:
        # Always release the connection, even if an insert fails.
        db.close()
def _split_author_names(raw_authors):
    """Split a BibTeX-style author string into a list of space-free names."""
    # Turn the "A, B, and C" / "A and B" separators into plain commas, then
    # drop every remaining space (so "Alice Smith" becomes "AliceSmith").
    normalized = raw_authors.replace(", and ", ",").replace(" and ", ",")
    return normalized.replace(" ", "").split(",")


def _add_coauthor_links(authors, nodes, edges):
    """Register unseen authors in ``nodes`` and link co-author pairs in ``edges``."""
    for name in authors:
        if name not in nodes:
            nodes[name] = len(nodes)
    # NOTE(review): ``edges`` maps source id -> target id, so a later pair
    # with the same source overwrites the earlier one. Keeping every edge
    # would change the returned shape that callers consume - confirm intent.
    for first, second in itertools.combinations(authors, 2):
        edges[nodes[first]] = nodes[second]


def _extract_bibtex_field(page, field):
    """Return the text inside ``field = {...}``, or None if it is not found."""
    start = page.find(field)
    if start == -1:
        return None
    line = page[start:]
    newline = line.find("\n")
    if newline != -1:
        line = line[:newline]
    value_end = line.rfind("}")
    if value_end == -1:
        # Malformed line without a closing brace: treat it as a miss.
        return None
    value_start = len("{0} = {1}".format(field, "{"))
    return line[value_start:value_end]


def _read_citation_page(year, num_article):
    """Download the citation page of one ePrint article and return it as text."""
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    url = "http://eprint.iacr.org/cgi-bin/cite.pl?entry={0}/{1}".format(
        year, str(num_article).zfill(3))
    response = urlopen(url)
    try:
        raw = response.read()
    finally:
        response.close()
    if not isinstance(raw, str):
        # Python 3 returns bytes; decode once at the I/O boundary.
        raw = raw.decode("utf-8", "replace")
    return raw


def fetch_data_from_iacr(**kwargs):
    """Build a co-authorship graph from IACR ePrint author lists.

    Author strings come either from a pickle file (when ``path_to_pkl`` is
    given) or from the ePrint citation pages, fetched article by article.

    Keyword Args:
        year_start: First year to process (default 1996).
        year_end: Last year to process, inclusive (default: current year).
        field: BibTeX field to extract when scraping (default "author").
        path_to_pkl: Path to a pickle holding ``{year: [author_string, ...]}``.
            When None (the default) the data is scraped from the network.

    Returns:
        ``[nodes, edges]`` where ``nodes`` maps author name -> integer id and
        ``edges`` maps source id -> target id.

    Raises:
        KeyError: If the pickle has no entry for a year in the range.
    """
    year_start = kwargs.get('year_start', 1996)
    year_end = kwargs.get('year_end', datetime.datetime.now().year)
    field = kwargs.get('field', "author")
    path_to_pkl = kwargs.get('path_to_pkl', None)
    max_misses = 3  # consecutive missing articles that end a year
    nodes = {}
    edges = {}

    all_authors = None
    if path_to_pkl is not None:
        # NOTE: unpickling can execute arbitrary code; only load pickle
        # files this script produced itself.
        with open(path_to_pkl, 'rb') as pkl_file:
            all_authors = pickle.load(pkl_file)

    for year in range(year_start, year_end + 1):
        if path_to_pkl is not None:
            for raw_authors in all_authors[year]:
                _add_coauthor_links(
                    _split_author_names(raw_authors), nodes, edges)
            continue

        num_article = 1
        # Reset per year: a counter left at zero by the previous year would
        # never hit the stop condition again.
        misses_left = max_misses
        while True:
            sys.stdout.write("{0}: {1}\r".format(year, num_article))
            sys.stdout.flush()
            raw_authors = _extract_bibtex_field(
                _read_citation_page(year, num_article), field)
            num_article += 1
            if raw_authors is None:
                misses_left -= 1
                if misses_left == 0:
                    break
                continue
            misses_left = max_misses
            _add_coauthor_links(
                _split_author_names(raw_authors), nodes, edges)
        # Finish the "\r"-overwritten progress line for this year.
        sys.stdout.write("\n")
    return [nodes, edges]
def blob_all_data_from_iacr(**kwargs):
    """Scrape raw author strings from IACR ePrint and cache them in a pickle.

    For each year the citation pages are fetched article by article until
    three consecutive articles lack the requested field. The result, a dict
    ``{year: [author_string, ...]}``, is pickled to disk.

    Keyword Args:
        year_start: First year to scrape (default 1996).
        year_end: Last year to scrape, inclusive (default: current year).
        field: BibTeX field to extract (default "author").
        path_to_pkl: Output pickle path (default '/tmp/all_authors.pkl').
    """
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    year_start = kwargs.get('year_start', 1996)
    year_end = kwargs.get('year_end', datetime.datetime.now().year)
    field = kwargs.get('field', "author")
    path_to_pkl = kwargs.get('path_to_pkl', '/tmp/all_authors.pkl')
    max_misses = 3  # consecutive missing articles that end a year
    # Length of the 'field = {' prefix that precedes the value.
    value_start = len("{0} = {1}".format(field, "{"))
    all_authors = {}

    for year in range(year_start, year_end + 1):
        num_article = 1
        # Reset per year: a counter left at zero by the previous year would
        # never hit the stop condition again.
        misses_left = max_misses
        all_authors[year] = []
        while True:
            sys.stdout.write("{0}: {1}\r".format(year, num_article))
            sys.stdout.flush()
            response = urlopen(
                "http://eprint.iacr.org/cgi-bin/cite.pl?entry={0}/{1}".format(
                    year, str(num_article).zfill(3)))
            try:
                page = response.read()
            finally:
                response.close()
            if not isinstance(page, str):
                # Python 3 returns bytes; decode once at the I/O boundary.
                page = page.decode("utf-8", "replace")
            num_article += 1

            authors = None
            field_pos = page.find(field)
            if field_pos != -1:
                line = page[field_pos:]
                newline = line.find("\n")
                if newline != -1:
                    line = line[:newline]
                value_end = line.rfind("}")
                # A line without a closing brace is treated as a miss.
                if value_end != -1:
                    authors = line[value_start:value_end]

            if authors is None:
                misses_left -= 1
                if misses_left == 0:
                    break
                continue
            misses_left = max_misses
            all_authors[year].append(authors)
        # Finish the "\r"-overwritten progress line for this year.
        sys.stdout.write("\n")

    with open(path_to_pkl, 'wb') as output:
        pickle.dump(all_authors, output)
def main():
    """Build the co-authorship database, using a pickle cache when possible.

    If the cache file is missing, the author data is scraped and cached
    first. If it exists, it must cover every year in the configured range.

    Returns:
        -1 when the cache lacks a year in the range, otherwise None.
    """
    # Set to None to scrape directly from the network without a cache.
    path_to_pkl = '/tmp/all_authors.pkl'
    year_start = 1996
    year_end = 2012

    if path_to_pkl is not None:
        if not os.path.isfile(path_to_pkl):
            blob_all_data_from_iacr(
                year_start=year_start,
                year_end=year_end,
                path_to_pkl=path_to_pkl,
            )
        else:
            # NOTE: unpickling can execute arbitrary code; only load pickle
            # files this script produced itself.
            with open(path_to_pkl, 'rb') as pkl_file:
                all_authors = pickle.load(pkl_file)
            # Validate the whole range, not just the first year, so a stale
            # cache is reported instead of failing later with a KeyError.
            for year in range(year_start, year_end + 1):
                if year not in all_authors:
                    print("Only year {0} is presented in file.".format(year - 1))
                    return -1

    nodes, edges = fetch_data_from_iacr(
        field="author",
        year_start=year_start,
        year_end=year_end,
        path_to_pkl=path_to_pkl,
    )
    create_table(nodes, edges)
if __name__ == "__main__":
    # Run the script and hand main()'s return value to the shell as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement