Advertisement
2ck

Upload script

2ck
May 28th, 2014
41
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.81 KB | None | 0 0
  1. from PyOpenWorm import Data,DefaultConfig
  2. from xlrd import open_workbook
  3. from rdflib import Literal, URIRef, Graph,Namespace,RDFS
  4. import re
  5. import httplib as H
  6. from itertools import chain
  7. from os import getcwd
  8. # Read in from the spreadsheet
  9. # leading substring
  10. # matched names in first column go in an array
  11. # Assert that the development name is actually unique
  12. def read(n,sheet_number,cols,start=1):
  13.     rb = open_workbook(n)
  14.     for row in range(start,rb.sheet_by_index(sheet_number).nrows):
  15.         l = []
  16.         for i in range(cols):
  17.             l.append(str(rb.sheet_by_index(sheet_number).cell(row,i).value))
  18.         yield l
  19.  
  20. # Replace spaces with dots
  21. nospace_regex = re.compile(r"^([A-Z0-9]+)([a-z]+)$")
  22. goodname_regex = re.compile(r"^([A-Z0-9]+)(?: ([a-z]+))?$")
  23. def normalize_lineage_name(name):
  24.     n = str(name)
  25.  
  26.     #if "," in n:
  27.         #parts = n.split(",")
  28.         #for x in parts:
  29.             #for z in normalize_lineage_name(x):
  30.                 #yield z
  31.         #return
  32.  
  33.     n = n.replace(".", " ")
  34.     n = n.strip()
  35.  
  36.  
  37.     # find the starting substring with capitals and ensure there's a space after
  38.     m = re.match(nospace_regex, n)
  39.     if m:
  40.         n = str(m.group(1)) +" "+ str(m.group(2))
  41.  
  42.     return n
  43.  
  44. def normalize(s):
  45.     for i in s:
  46.         n = normalize_lineage_name(i)
  47.         yield x
  48.  
  49. def urlize(s,ns):
  50.     s = s.replace(" ", "_")
  51.     return ns[s]
  52.  
  53. def bad_names(names):
  54.     for n in names:
  55.         if not re.match(goodname_regex,n):
  56.             yield n
  57. def good_names(names):
  58.     for n in names:
  59.         if re.match(goodname_regex,n):
  60.             yield n
  61.  
  62. def filter_lineage_slash(i):
  63.     for k in i:
  64.         if '/' in k[1]:
  65.             yield k
  66.  
  67. def triple_adult_dev_mapping():
  68.     sheet = read("lineage.xls",sheet_number=2, cols=3, start=2)
  69.     for r in sheet:
  70.         yield (r[0], "development_name", r[1])
  71.  
  72. def triple_dev_tree():
  73.     sheet = read("lineage.xls",sheet_number=1, cols=6, start=2)
  74.     for r in sheet:
  75.         yield (r[0], "daughter_of", r[4])
  76.  
  77. #def missing_mappings():
  78.     #a = set([r[0] for r in triple_dev_tree()])
  79.     #a |= set([r[2] for r in triple_dev_tree()])
  80.     #b = set([r[2] for r in triple_adult_dev_mapping()])
  81.     #return (a - b, b - a)
  82.  
  83. def subject(s):
  84.     for i in s:
  85.         yield i[0]
  86.  
  87. def object(s):
  88.     for i in s:
  89.         yield i[2]
  90.  
  91. def smap_o(s,f):
  92.     m = f(object(s))
  93.     for i in zip(s,m):
  94.         i[2] = m
  95.         yield i
  96.  
  97. def all_bad_names():
  98.     collector = set([])
  99.     names = chain(subject(triple_dev_tree()), object(triple_dev_tree()), object(triple_adult_dev_mapping()))
  100.     for p in bad_names(normalize(names)):
  101.         collector.add(p)
  102.     return collector
  103.  
  104. def dev_bad_names():
  105.     collector = set([])
  106.     names = chain(object(triple_dev_tree()),subject(triple_dev_tree()))
  107.     for p in bad_names(normalize(names)):
  108.         collector.add(p)
  109.     return collector
  110.  
  111. def put_in_sesame(graph):
  112.     s = graph.serialize(format="n3")
  113.     con = H.HTTPConnection("107.170.133.175:8080")
  114.     con.request("POST", "/openrdf-sesame/repositories/OpenWorm2/statements", s, {"Content-Type": "application/x-turtle;charset=UTF-8"})
  115.     r = con.getresponse()
  116.     print "sesame response is %d " % r.status
  117.  
  118. class D:
  119.     namespace = Namespace("http://openworm.org/entities/")
  120. d = D()
  121. def f(i):
  122.     return urlize(normalize_lineage_name(i),d.namespace)
  123. def upload_data_to_db():
  124.     graph = Graph()
  125.     for i in ((f(x[0]), d.namespace[x[1]], f(x[2])) for x in triple_dev_tree()):
  126.         graph.add(i)
  127.     put_in_sesame(graph)
  128.  
  129. def upload_other_to_db():
  130.     graph = Graph()
  131.     def j():
  132.         for x in triple_adult_dev_mapping():
  133.             if re.match(goodname_regex,x[2]):
  134.                 yield (f(x[2]), RDFS["label"], Literal(x[0]))
  135.     for i in j():
  136.         graph.add(i)
  137.     put_in_sesame(graph)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement