Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Star import kept first so the explicit imports below win any name clash.
from paver.easy import *

import csv
import os
import os.path
from collections import defaultdict
from itertools import ifilter
from types import DictType, GeneratorType

try:
    from shlex import quote as shell_quote  # Python 3.3+
except ImportError:
    from pipes import quote as shell_quote  # Python 2

import ruffus
# Pipeline-wide settings, stored on paver's ``options`` object.
options(
    DATA_DIR='Data',              # root directory that the raw inputs are read from
    PROCESSED='Data/Processed',   # directory the per-study tables are written to
    STITCH_CUT=900,               # STITCH rows must score strictly above this
)
@task
def touch_data():
    """Run ``touch`` on every file found under ``options.DATA_DIR``.

    The complete path (directory part included) is shell-quoted before it
    is interpolated into the command, so names containing spaces, quotes or
    other shell metacharacters are passed to ``touch`` as a single argument.
    """
    for dirpath, _, filenames in os.walk(options.DATA_DIR):
        for filename in filenames:
            target = os.path.join(dirpath, filename)
            sh('touch %s' % shell_quote(target))
@task
def run():
    """Paver task: ask ruffus to run the pipeline up to ``top_function``."""
    final_targets = [top_function]
    ruffus.pipeline_run(final_targets)
@ruffus.follows('process_data', 'process_stitch')
def top_function():
    """Terminal pipeline stage; performs no work of its own.

    It exists only so that a single target depends on both
    ``process_data`` and ``process_stitch``.
    """
@ruffus.merge(os.path.join(options.PROCESSED, '*.csv'),
              os.path.join(options.PROCESSED, 'results.out'))
@ruffus.follows('process_bras', 'process_chassey', 'process_fu',
                'process_konig', 'process_kumar', 'process_shapira',
                'process_tai')
def process_data(in_files, out_file):
    """Concatenate the per-study tab-separated tables into one file.

    in_files -- paths of the tables to merge; each is read with its first
                line taken as the column header.
    out_file -- path of the merged table.  No header line is written to it.
    """
    columns = ('Symbol', 'Viral-Protein',
               'Disease', 'Method', 'Ref')
    with open(out_file, 'w') as merged:
        sink = csv.DictWriter(merged, columns, delimiter='\t')
        for table_path in in_files:
            with open(table_path) as table:
                records = csv.DictReader(table, delimiter='\t')
                sink.writerows(records)
@ruffus.files(os.path.join(options.DATA_DIR, 'Bras-2008', 'Table-S2.csv'),
              os.path.join(options.PROCESSED, 'Bras-2008.csv'))
def process_bras(in_file, out_file):
    """Convert the Bras-2008 table: one HIV / RNAi record per input row."""
    def conv_fun(row):
        # Constant annotation first, then the one per-row value.
        record = {
            'Viral-Protein': 'Unknown',
            'Disease': 'HIV',
            'Method': 'RNAi',
            'Ref': 'Bras-2008',
        }
        record['Symbol'] = row["Symbol"]
        return record

    process_file(in_file, out_file, conv_fun)
@ruffus.files(os.path.join(options.DATA_DIR, 'Chassey-2008', 'msb200866-s2.csv'),
              os.path.join(options.PROCESSED, 'Chassey-2008.csv'))
def process_chassey(in_file, out_file):
    """Convert the Chassey-2008 table into HCV interaction records.

    A row yields one record for each evidence column that is non-blank:
    "Text Mining" gives Method 'Literature', "Y2H" gives Method 'Y2H'.
    """
    # (input column, Method value), checked in this order for every row.
    evidence_columns = (("Text Mining", 'Literature'), ("Y2H", 'Y2H'))

    def conv_fun(row):
        for column, method in evidence_columns:
            if not row[column].strip():
                continue
            yield {
                'Symbol': row["Gene Symbol"],
                'Viral-Protein': row["HCV-Protein"],
                'Disease': 'HCV',
                'Method': method,
                'Ref': 'Chassey-2008',
            }

    process_file(in_file, out_file, conv_fun)
@ruffus.files(os.path.join(options.DATA_DIR, 'Fu-2009', 'hiv_interactions'),
              os.path.join(options.PROCESSED, 'Fu-2009.csv'))
def process_fu(in_file, out_file):
    """Convert the Fu-2009 table: one HIV / Literature record per row.

    The "Gene ID 2" column is translated to a symbol through the mapping
    returned by ``get_geneID2Symbol()``.
    """
    def conv_fun(row, symbol_id):
        gene_symbol = symbol_id[row["Gene ID 2"]]
        viral_protein = row["product name 1"]
        return {
            'Symbol': gene_symbol,
            'Viral-Protein': viral_protein,
            'Disease': 'HIV',
            'Method': 'Literature',
            'Ref': 'Fu-2009',
        }

    id_to_symbol = get_geneID2Symbol()
    process_file(in_file, out_file, conv_fun, extra=(id_to_symbol,))
@ruffus.files(os.path.join(options.DATA_DIR, 'Konig-2010', 'nature08699-s8.csv'),
              os.path.join(options.PROCESSED, 'Konig-2010.csv'))
def process_konig(in_file, out_file):
    """Convert the Konig-2010 table into Influenza / RNAi records.

    Only rows whose "Influenza (this study)" cell is non-empty produce a
    record; the cell is not stripped, so whitespace counts as content.
    """
    def conv_fun(row):
        if len(row["Influenza (this study)"]) == 0:
            return None
        return {
            'Symbol': row["Symbol"],
            'Viral-Protein': 'Unknown',
            'Disease': 'Influenza',
            'Method': 'RNAi',
            'Ref': 'Konig-2010',
        }

    process_file(in_file, out_file, conv_fun)
@ruffus.files(os.path.join(options.DATA_DIR, 'Kumar-2010', 'mmc2.csv'),
              os.path.join(options.PROCESSED, 'kumar-2010.csv'))
def process_kumar(in_file, out_file):
    """Convert the Kumar-2010 table into Influenza / RNAi records.

    Only rows whose "p-value" parses to a float below 0.05 produce a record.
    """
    def conv_fun(row):
        p_value = float(row["p-value"])
        if not (p_value < 0.05):
            return None
        return {
            'Symbol': row["GeneSymbol"],
            'Viral-Protein': 'Unknown',
            'Disease': 'Influenza',
            'Method': 'RNAi',
            'Ref': 'Kumar-2010',
        }

    process_file(in_file, out_file, conv_fun)
@ruffus.files(os.path.join(options.DATA_DIR, 'Shapira-2009', 'mmc2.csv'),
              os.path.join(options.PROCESSED, 'Shapira-2009.csv'))
def process_shapira(in_file, out_file):
    """Convert the Shapira-2009 table into interaction records.

    Every input row yields one Influenza / Y2H record attributed to
    Shapira-2009.  In addition, for each column named in ``pdict`` whose
    cell is non-blank, a record carrying that column's Ref / Disease /
    Method is emitted for the same gene.

    The additional records get 'Viral-Protein' set to 'Unknown' (unless a
    ``pdict`` entry supplies its own value) so the column is never left
    blank, and they are emitted in sorted column-name order so the output
    does not depend on dict iteration order.
    """
    def conv_fun(row, symbol_id, pdict):
        gene = symbol_id[row['entrez gene ID']]
        yield {
            'Symbol': gene,
            'Ref': 'Shapira-2009',
            'Disease': 'Influenza',
            'Method': 'Y2H',
            'Viral-Protein': 'Unknown'
        }
        for field, info in sorted(pdict.items()):
            if row[field].strip():
                record = {'Symbol': gene, 'Viral-Protein': 'Unknown'}
                record.update(info)
                yield record

    # Input column name -> annotation of the study that column refers to.
    pdict = {
        'HCV Li et al. (25 genes)': {
            'Ref': 'Li-2009',
            'Disease': 'HCV',
            'Method': 'RNAi'
        },
        "WNV Krishnan et al. (14 genes)": {
            'Ref': 'Krishnan-2008',
            'Disease': 'West-Nile',
            'Method': 'RNAi'
        },
        "HIV Zhou et al. (5 genes)": {
            'Ref': 'Zhou-2008',
            'Disease': 'HIV',
            'Method': 'RNAi'
        }
    }
    symbol_id = get_geneID2Symbol()
    process_file(in_file, out_file, conv_fun, extra=(symbol_id, pdict))
@ruffus.files(os.path.join(options.DATA_DIR, 'Tai-2009', 'mmc3.csv'),
              os.path.join(options.PROCESSED, 'Tai-2009.csv'))
def process_tai(in_file, out_file):
    """Convert the Tai-2009 table: one HCV / RNAi record per input row."""
    def conv_fun(row):
        record = {
            'Ref': 'Tai-2009',
            'Disease': 'HCV',
            'Method': 'RNAi',
            'Viral-Protein': 'Unknown',
        }
        record['Symbol'] = row['Symbol']
        return record

    process_file(in_file, out_file, conv_fun)
@ruffus.files([os.path.join(options.DATA_DIR, 'stitch', 'protein.aliases.v8.2.txt'),
               os.path.join(options.DATA_DIR, 'stitch', 'protein_chemical.human.links.v2.0.tsv'),
               os.path.join(options.DATA_DIR, 'stitch', 'chemical.aliases.v2.0.tsv'),],
              os.path.join(options.DATA_DIR, 'stitch', 'processed.csv'))
def process_stitch(in_files, out_file):
    """Write a tab-separated (Symbol, Chemical) table from the STITCH files.

    in_files -- (protein alias file, protein-chemical links file,
                chemical alias file), in that order.
    out_file -- path of the table to write; no header line is written.

    A links row is kept when its protein id starts with '9606.' and its
    combined score is strictly greater than ``options.STITCH_CUT``.  The
    protein id and the chemical id are replaced by their aliases when one
    is known and are written unchanged otherwise.
    """
    protein_alias, protein_chemical, chemical_alias = in_files
    symbol_id = get_geneID2Symbol(source='symbol')

    print('getting protein-conv')
    protein_conv = {}
    with open(protein_alias) as handle:
        alias_fields = ('species', 'protein-id', 'alias', 'source')
        # Discard the first line; tolerate an empty file.
        next(handle, None)
        for row in csv.DictReader(handle, fieldnames=alias_fields, delimiter='\t'):
            if row['species'] == '9606' and row['source'] == 'Ensembl_EntrezGene':
                # .get() avoids inserting missed aliases into the mapping,
                # and skipping misses keeps None out of protein_conv so the
                # fallback to the raw protein id below actually applies.
                symbol = symbol_id.get(row['alias'])
                if symbol is not None:
                    protein_conv[row['protein-id']] = symbol

    print('getting chem-conv')
    chem_conv = {}
    with open(chemical_alias) as handle:
        # When a chemical has several alias rows, the last one read wins.
        for row in csv.DictReader(handle, delimiter='\t'):
            chem_conv[row['chemical']] = row['alias']

    out_fields = ('Symbol', 'Chemical')
    in_fields = ('chemical', 'protein', 'combined_score')
    print('processing interactions')
    with open(out_file, 'w') as out_handle:
        writer = csv.DictWriter(out_handle, out_fields, delimiter='\t')
        with open(protein_chemical) as handle:
            links = csv.DictReader(handle, fieldnames=in_fields, delimiter='\t')
            for c, row in enumerate(links):
                if c % 10000 == 0:
                    print(c)  # progress indicator
                if not row['protein'].startswith('9606.'):
                    continue
                if int(row['combined_score']) <= options.STITCH_CUT:
                    continue
                prot = row['protein'].split('.', 1)[1]
                writer.writerow({
                    'Symbol': protein_conv.get(prot, prot),
                    'Chemical': chem_conv.get(row['chemical'], row['chemical']),
                })
def get_geneID2Symbol(source='geneid', fname=None):
    """Build a lookup from one gene_info column to the gene symbol.

    source -- name of the column whose values become the keys (one of the
              names in ``fields`` below).  For 'symbol' the cell is split
              on '|' and every piece becomes a key.
    fname  -- path of the tab-separated gene_info file.  Defaults to
              'Homo_sapiens.gene_info' inside ``options.DATA_DIR``.

    Returns a ``defaultdict`` that yields None for unknown keys.

    Lines starting with '#' are skipped, and quoting is disabled so a
    field that begins with a double quote cannot swallow the lines that
    follow it.
    """
    if fname is None:
        fname = os.path.join(options.DATA_DIR, 'Homo_sapiens.gene_info')
    # Column names are applied by position.
    # NOTE(review): presumably the NCBI gene_info column order -- confirm.
    fields = ('taxid', 'geneid', 'symbol', 'locustag', 'synonyms',
              'dbXrefs', 'chromosome', 'maplocation', 'description',
              'type', 'symauth', 'fullauth', 'status', 'other', 'date')
    out = defaultdict(lambda: None)
    with open(fname) as handle:
        data_lines = (line for line in handle if not line.startswith('#'))
        reader = csv.DictReader(data_lines, fieldnames=fields,
                                delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            symbol = row['symbol']
            if source == 'symbol':
                for val in row[source].split('|'):
                    out[val] = symbol
            else:
                out[row[source]] = symbol
    return out
def process_file(in_file, out_file, func, extra=tuple()):
    """Convert one tab-separated input table into the common record format.

    in_file  -- path of the input table; its first line is the header.
    out_file -- path of the table to write.  It starts with a header line
                naming the five output columns.
    func     -- called as ``func(row, *extra)`` for every input row.  It may
                return a dict (one record), a generator / list / tuple of
                dicts (several records), or None (no record).  Any other
                return value is ignored.
    extra    -- additional positional arguments passed to ``func``.
    """
    fields = ('Symbol', 'Viral-Protein',
              'Disease', 'Method', 'Ref')
    with open(in_file) as in_handle:
        line_gen = csv.DictReader(in_handle, delimiter='\t')
        with open(out_file, 'w') as out_handle:
            # NOTE: the header is terminated with '\n', while the csv writer
            # uses its dialect's line terminator for the data rows.
            out_handle.write('\t'.join(fields) + '\n')
            writer = csv.DictWriter(out_handle, fields, delimiter='\t')
            for row in line_gen:
                val = func(row, *extra)
                # isinstance (not an exact type match) so dict subclasses
                # such as OrderedDict are written instead of dropped.
                if isinstance(val, dict):
                    writer.writerow(val)
                elif isinstance(val, (GeneratorType, list, tuple)):
                    writer.writerows(val)
Add Comment
Please, Sign In to add comment