Guest User

Untitled

a guest
Jun 25th, 2018
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.17 KB | None | 0 0
  1. from paver.easy import *
  2. import os.path, os
  3. import csv
  4. import ruffus
  5. from collections import defaultdict
  6. from types import GeneratorType, DictType
  7. from itertools import ifilter
  8.  
  9.  
  # Project-wide paver settings read by the tasks and pipeline steps in this file.
  options(
      DATA_DIR='Data',             # root folder that holds the input data sets
      PROCESSED='Data/Processed',  # folder receiving the per-study output tables
      STITCH_CUT=900,              # combined_score must exceed this in process_stitch
  )
  15.  
  @task
  def touch_data():
      """Set the access/modification time of every file under DATA_DIR to now.

      Walks ``options.DATA_DIR`` recursively and stamps each regular file that
      ``os.walk`` reports.

      The timestamps are updated with ``os.utime`` instead of building a
      ``touch`` shell command: the previous hand-rolled escaping only handled
      spaces in the file name, so directories containing spaces, or names with
      quotes, ``&``, ``;`` and similar characters, produced broken (or
      unintended) shell commands.
      """
      for folder, _, names in os.walk(options.DATA_DIR):
          for name in names:
              # Passing None as the times argument means "use the current time".
              os.utime(os.path.join(folder, name), None)
  22.  
  @task
  def run():
      """Run the ruffus pipeline with ``top_function`` as its only target."""
      targets = [top_function]
      ruffus.pipeline_run(targets)
  26.  
  @ruffus.follows('process_data', 'process_stitch')
  def top_function():
      """Final pipeline target; performs no work of its own.

      The decorator declares that it follows ``process_data`` and
      ``process_stitch``, so requesting this target pulls in both of them.
      """
  30.  
  @ruffus.merge(os.path.join(options.PROCESSED, '*.csv'),
                os.path.join(options.PROCESSED, 'results.out'))
  @ruffus.follows('process_bras', 'process_chassey', 'process_fu',
                  'process_konig', 'process_kumar', 'process_shapira',
                  'process_tai')
  def process_data(in_files, out_file):
      """Concatenate the tab-separated tables in *in_files* into *out_file*.

      Each input is read with its own first line as the header; rows are
      re-emitted in the fixed column order below. No header line is written
      to the merged output.
      """
      columns = ('Symbol', 'Viral-Protein', 'Disease', 'Method', 'Ref')

      with open(out_file, 'w') as sink:
          merged = csv.DictWriter(sink, columns, delimiter='\t')
          for path in in_files:
              with open(path) as source:
                  records = csv.DictReader(source, delimiter='\t')
                  merged.writerows(records)
  47.  
  48.  
  49.  
  50.  
  51.  
  @ruffus.files(os.path.join(options.DATA_DIR, 'Bras-2008', 'Table-S2.csv'),
                os.path.join(options.PROCESSED, 'Bras-2008.csv'))
  def process_bras(in_file, out_file):
      """Convert the Bras-2008 table into the common five-column format."""

      def to_record(entry):
          # Every input row yields exactly one HIV / RNAi record.
          record = {
              'Viral-Protein': 'Unknown',
              'Disease': 'HIV',
              'Method': 'RNAi',
              'Ref': 'Bras-2008',
          }
          record['Symbol'] = entry["Symbol"]
          return record

      process_file(in_file, out_file, to_record)
  66.  
  67.  
  @ruffus.files(os.path.join(options.DATA_DIR, 'Chassey-2008', 'msb200866-s2.csv'),
                os.path.join(options.PROCESSED, 'Chassey-2008.csv'))
  def process_chassey(in_file, out_file):
      """Convert the Chassey-2008 table into the common five-column format.

      A row can produce up to two records: one when its "Text Mining" cell is
      non-blank and one when its "Y2H" cell is non-blank.
      """
      # (input column, value written to the 'Method' field), in emission order.
      evidence_columns = (
          ("Text Mining", 'Literature'),
          ("Y2H", 'Y2H'),
      )

      def to_records(entry):
          for column, method in evidence_columns:
              # A cell holding only whitespace counts as empty.
              if entry[column].strip():
                  yield {
                      'Symbol': entry["Gene Symbol"],
                      'Viral-Protein': entry["HCV-Protein"],
                      'Disease': 'HCV',
                      'Method': method,
                      'Ref': 'Chassey-2008',
                  }

      process_file(in_file, out_file, to_records)
  90.  
  91.  
  92.  
  93.  
  @ruffus.files(os.path.join(options.DATA_DIR, 'Fu-2009', 'hiv_interactions'),
                os.path.join(options.PROCESSED, 'Fu-2009.csv'))
  def process_fu(in_file, out_file):
      """Convert the Fu-2009 interaction table into the common format.

      The "Gene ID 2" column is translated to a symbol through the lookup
      returned by ``get_geneID2Symbol()``.
      """

      def to_record(entry, lookup):
          gene = lookup[entry["Gene ID 2"]]
          viral_protein = entry["product name 1"]
          return {
              'Symbol': gene,
              'Viral-Protein': viral_protein,
              'Disease': 'HIV',
              'Method': 'Literature',
              'Ref': 'Fu-2009',
          }

      id_to_symbol = get_geneID2Symbol()
      process_file(in_file, out_file, to_record, extra=(id_to_symbol,))
  110.  
  @ruffus.files(os.path.join(options.DATA_DIR, 'Konig-2010', 'nature08699-s8.csv'),
                os.path.join(options.PROCESSED, 'Konig-2010.csv'))
  def process_konig(in_file, out_file):
      """Convert the Konig-2010 table into the common five-column format.

      Only rows with a non-empty "Influenza (this study)" cell are kept.
      """

      def to_record(entry):
          # Returning None tells process_file to skip the row.
          if len(entry["Influenza (this study)"]) == 0:
              return None
          return {
              'Symbol': entry["Symbol"],
              'Viral-Protein': 'Unknown',
              'Disease': 'Influenza',
              'Method': 'RNAi',
              'Ref': 'Konig-2010',
          }

      process_file(in_file, out_file, to_record)
  127.  
  128.  
  @ruffus.files(os.path.join(options.DATA_DIR, 'Kumar-2010', 'mmc2.csv'),
                os.path.join(options.PROCESSED, 'kumar-2010.csv'))
  def process_kumar(in_file, out_file):
      """Convert the Kumar-2010 table into the common five-column format.

      Only rows whose "p-value" parses to a number below 0.05 are kept.
      """

      def to_record(entry):
          significant = float(entry["p-value"]) < 0.05
          if not significant:
              # Returning None tells process_file to skip the row.
              return None
          return {
              'Symbol': entry["GeneSymbol"],
              'Viral-Protein': 'Unknown',
              'Disease': 'Influenza',
              'Method': 'RNAi',
              'Ref': 'Kumar-2010',
          }

      process_file(in_file, out_file, to_record)
  143.  
  @ruffus.files(os.path.join(options.DATA_DIR, 'Shapira-2009', 'mmc2.csv'),
                os.path.join(options.PROCESSED, 'Shapira-2009.csv'))
  def process_shapira(in_file, out_file):
      """Convert the Shapira-2009 table into the common five-column format.

      Every row yields one Shapira-2009 record, plus one extra record for each
      cross-reference column (listed below) whose cell is non-blank in that row.
      """
      # Input column -> constant fields of the extra record it triggers.
      # These entries carry no 'Viral-Protein' key.
      cross_refs = {
          'HCV Li et al. (25 genes)': {
              'Ref': 'Li-2009',
              'Disease': 'HCV',
              'Method': 'RNAi',
          },
          "WNV Krishnan et al. (14 genes)": {
              'Ref': 'Krishnan-2008',
              'Disease': 'West-Nile',
              'Method': 'RNAi',
          },
          "HIV Zhou et al. (5 genes)": {
              'Ref': 'Zhou-2008',
              'Disease': 'HIV',
              'Method': 'RNAi',
          },
      }

      def to_records(entry, lookup, extra_sources):
          gene = lookup[entry['entrez gene ID']]
          yield {
              'Symbol': gene,
              'Ref': 'Shapira-2009',
              'Disease': 'Influenza',
              'Method': 'Y2H',
              'Viral-Protein': 'Unknown',
          }
          for column, constants in extra_sources.items():
              if entry[column].strip():
                  # Symbol plus a copy of the constant fields for this source.
                  yield dict(constants, Symbol=gene)

      id_to_symbol = get_geneID2Symbol()
      process_file(in_file, out_file, to_records,
                   extra=(id_to_symbol, cross_refs))
  181.  
  182.  
  @ruffus.files(os.path.join(options.DATA_DIR, 'Tai-2009', 'mmc3.csv'),
                os.path.join(options.PROCESSED, 'Tai-2009.csv'))
  def process_tai(in_file, out_file):
      """Convert the Tai-2009 table into the common five-column format."""

      def to_record(entry):
          # Every input row yields exactly one HCV / RNAi record.
          record = {
              'Ref': 'Tai-2009',
              'Disease': 'HCV',
              'Method': 'RNAi',
              'Viral-Protein': 'Unknown',
          }
          record['Symbol'] = entry['Symbol']
          return record

      process_file(in_file, out_file, to_record)
  197.  
  @ruffus.files([os.path.join(options.DATA_DIR, 'stitch', 'protein.aliases.v8.2.txt'),
                 os.path.join(options.DATA_DIR, 'stitch', 'protein_chemical.human.links.v2.0.tsv'),
                 os.path.join(options.DATA_DIR, 'stitch', 'chemical.aliases.v2.0.tsv'),],
                os.path.join(options.DATA_DIR, 'stitch', 'processed.csv'))
  def process_stitch(in_files, out_file):
      """Write a (Symbol, Chemical) table from the three STITCH input files.

      *in_files* must list, in order: the protein alias file, the
      protein-chemical links file and the chemical alias file. A link is kept
      when its protein id starts with '9606.' and its combined_score is
      strictly greater than ``options.STITCH_CUT``. Protein and chemical ids
      are replaced through the alias tables where a mapping exists; otherwise
      the raw id is written unchanged. The output has no header line.
      """
      alias_path, links_path, chem_path = in_files
      symbol_of = get_geneID2Symbol(source='symbol')

      print('getting protein-conv')
      alias_columns = ('species', 'protein-id', 'alias', 'source')
      protein_to_symbol = {}
      with open(alias_path) as handle:
          next(handle)  # discard the first line before parsing rows
          aliases = csv.DictReader(handle, fieldnames=alias_columns,
                                   delimiter='\t')
          for entry in aliases:
              if entry['species'] != '9606':
                  continue
              if entry['source'] != 'Ensembl_EntrezGene':
                  continue
              protein_to_symbol[entry['protein-id']] = symbol_of[entry['alias']]

      print('getting chem-conv')
      chem_to_name = {}
      with open(chem_path) as handle:
          # This file is read with its own first line as the header.
          for entry in csv.DictReader(handle, delimiter='\t'):
              chem_to_name[entry['chemical']] = entry['alias']

      out_columns = ('Symbol', 'Chemical')
      link_columns = ('chemical', 'protein', 'combined_score')
      print('processing interactions')
      with open(out_file, 'w') as sink:
          writer = csv.DictWriter(sink, out_columns, delimiter='\t')
          with open(links_path) as handle:
              links = csv.DictReader(handle, fieldnames=link_columns,
                                     delimiter='\t')
              for count, link in enumerate(links):
                  # Progress indicator: row index every 10000 rows read.
                  if count % 10000 == 0:
                      print(count)

                  protein = link['protein']
                  if not protein.startswith('9606.'):
                      continue
                  if int(link['combined_score']) <= options.STITCH_CUT:
                      continue

                  # Drop the '9606.' prefix to get the key used in the alias table.
                  bare_id = protein.split('.', 1)[1]
                  chemical = link['chemical']
                  writer.writerow({
                      'Symbol': protein_to_symbol.get(bare_id, bare_id),
                      'Chemical': chem_to_name.get(chemical, chemical),
                  })
  245.  
  246.  
  247.  
  248.  
  249.  
  def get_geneID2Symbol(source = 'geneid'):
      """Build a lookup from one ``Homo_sapiens.gene_info`` column to the symbol.

      The file is read from ``options.DATA_DIR`` as a tab-separated table using
      the fixed column names below. *source* names the column whose value
      becomes the key. When *source* is ``'symbol'`` the cell is additionally
      split on ``'|'`` and every piece becomes a key.

      Returns a ``defaultdict`` whose missing keys evaluate to ``None``.
      """
      columns = ('taxid', 'geneid', 'symbol', 'locustag', 'synonyms',
                 'dbXrefs', 'chromosome', 'maplocation', 'description',
                 'type', 'symauth', 'fullauth', 'status', 'other', 'date')
      path = os.path.join(options.DATA_DIR, 'Homo_sapiens.gene_info')
      split_keys = source == 'symbol'

      lookup = defaultdict(lambda: None)
      with open(path) as handle:
          for record in csv.DictReader(handle, fieldnames=columns, delimiter='\t'):
              raw_key = record[source]
              keys = raw_key.split('|') if split_keys else [raw_key]
              for key in keys:
                  lookup[key] = record['symbol']

      return lookup
  267.  
  def process_file(in_file, out_file, func, extra = tuple()):
      """Convert a tab-separated table row by row and write the results.

      *in_file* is read with ``csv.DictReader`` (first line is the header).
      For every row, ``func(row, *extra)`` is called and its result decides
      what is written to *out_file*:

      * a ``dict`` (or any dict subclass) -> written as one row;
      * a generator                      -> every yielded dict is written;
      * anything else, notably ``None``  -> the row is skipped.

      The output starts with a header line naming the five fixed columns;
      keys missing from a returned dict are written as empty cells.

      The result is classified with ``isinstance`` rather than an exact
      ``type(...) ==`` comparison, so dict subclasses such as ``OrderedDict``
      are written instead of being silently dropped, and the Python-2-only
      ``types.DictType`` name is no longer needed.
      """
      fields = ('Symbol', 'Viral-Protein',
                'Disease', 'Method', 'Ref')
      with open(in_file) as in_handle:
          line_gen = csv.DictReader(in_handle, delimiter='\t')
          with open(out_file, 'w') as out_handle:
              # The header is written by hand, terminated by a bare '\n'.
              out_handle.write('\t'.join(fields) + '\n')
              writer = csv.DictWriter(out_handle,
                                      fields,
                                      delimiter='\t')
              for row in line_gen:
                  val = func(row, *extra)
                  if isinstance(val, dict):
                      writer.writerow(val)
                  elif isinstance(val, GeneratorType):
                      writer.writerows(val)
                  # Any other result means "no output for this row".
Add Comment
Please, Sign In to add comment