# Untitled

from paver.easy import *
import os.path, os
import csv
import ruffus
from collections import defaultdict
from types import GeneratorType, DictType
from itertools import ifilter


# Pipeline-wide settings registered with paver; read elsewhere in this file
# as ``options.<NAME>``.
options(
    # Root directory of the raw per-study input files.
    DATA_DIR='Data',
    # Directory that receives the normalised per-study tables.
    PROCESSED='Data/Processed',
    # Threshold that a STITCH combined_score must exceed in process_stitch.
    STITCH_CUT=900,
)

@task
def touch_data():
    """Run ``touch`` on every file found below ``options.DATA_DIR``.

    The directory tree is walked recursively and one ``touch`` command is
    issued per file through paver's ``sh``.
    """
    # Same function under two names: shlex.quote (Python 3) and
    # pipes.quote (Python 2).
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    for path, _, files in os.walk(options.DATA_DIR):
        for f in files:
            # Quote the complete path.  Escaping only the spaces of the file
            # name left directory names with spaces, and any other shell
            # metacharacter (quotes, '&', ';', '(' ...), unprotected.
            sh('touch %s' % quote(os.path.join(path, f)))

@task
def run():
    """Ask ruffus to run the pipeline with ``top_function`` as its target."""
    targets = [top_function]
    ruffus.pipeline_run(targets)

@ruffus.follows('process_data', 'process_stitch')
def top_function():
    """Terminal pipeline task; performs no work of its own.

    It is declared to follow ``process_data`` and ``process_stitch`` so that
    both can be requested through this single target.
    """

@ruffus.merge(os.path.join(options.PROCESSED, '*.csv'),
              os.path.join(options.PROCESSED, 'results.out'))
@ruffus.follows('process_bras', 'process_chassey', 'process_fu',
                'process_konig', 'process_kumar', 'process_shapira',
                'process_tai')
def process_data(in_files, out_file):
    """Concatenate the per-study tables into one tab-separated file.

    in_files -- paths of the tab-separated tables to merge; the first line
                of each is consumed as its header.
    out_file -- path of the merged table.  Only data rows are written; no
                header line is emitted.
    """
    columns = ('Symbol', 'Viral-Protein', 'Disease', 'Method', 'Ref')

    with open(out_file, 'w') as merged:
        sink = csv.DictWriter(merged, columns, delimiter='\t')
        for table in in_files:
            with open(table) as source:
                records = csv.DictReader(source, delimiter='\t')
                sink.writerows(records)


@ruffus.files(os.path.join(options.DATA_DIR, 'Bras-2008', 'Table-S2.csv'),
       os.path.join(options.PROCESSED, 'Bras-2008.csv'))
def process_bras(in_file, out_file):
    """Normalise the Bras-2008 table: one HIV / RNAi record per input row."""

    # Columns that are identical for every record of this study.
    constant_fields = {
        'Viral-Protein': 'Unknown',
        'Disease': 'HIV',
        'Method': 'RNAi',
        'Ref': 'Bras-2008',
    }

    def conv_fun(row):
        record = dict(constant_fields)
        record['Symbol'] = row["Symbol"]
        return record

    process_file(in_file, out_file, conv_fun)


@ruffus.files(os.path.join(options.DATA_DIR, 'Chassey-2008', 'msb200866-s2.csv'),
       os.path.join(options.PROCESSED, 'Chassey-2008.csv'))
def process_chassey(in_file, out_file):
    """Normalise the Chassey-2008 table into HCV interaction records.

    An input row yields one record for each evidence column ("Text Mining",
    then "Y2H") that holds non-blank text, so it can produce zero, one or
    two records.
    """

    # (input column, value written to 'Method'), in output order.
    evidence_columns = (
        ("Text Mining", 'Literature'),
        ("Y2H", 'Y2H'),
    )

    def conv_fun(row):
        for column, method in evidence_columns:
            if row[column].strip():
                yield {
                    'Symbol': row["Gene Symbol"],
                    'Viral-Protein': row["HCV-Protein"],
                    'Disease': 'HCV',
                    'Method': method,
                    'Ref': 'Chassey-2008',
                }

    process_file(in_file, out_file, conv_fun)


@ruffus.files(os.path.join(options.DATA_DIR, 'Fu-2009', 'hiv_interactions'),
       os.path.join(options.PROCESSED, 'Fu-2009.csv'))
def process_fu(in_file, out_file):
    """Normalise the Fu-2009 table: one HIV / Literature record per row.

    The "Gene ID 2" column is translated to a symbol through the mapping
    returned by ``get_geneID2Symbol``; ids missing from that mapping give a
    ``None`` symbol.
    """
    gene_names = get_geneID2Symbol()

    def conv_fun(row):
        symbol = gene_names[row["Gene ID 2"]]
        viral_protein = row["product name 1"]
        return {
            'Symbol': symbol,
            'Viral-Protein': viral_protein,
            'Disease': 'HIV',
            'Method': 'Literature',
            'Ref': 'Fu-2009',
        }

    process_file(in_file, out_file, conv_fun)

@ruffus.files(os.path.join(options.DATA_DIR, 'Konig-2010', 'nature08699-s8.csv'),
       os.path.join(options.PROCESSED, 'Konig-2010.csv'))
def process_konig(in_file, out_file):
    """Normalise the Konig-2010 table into Influenza / RNAi records.

    Rows whose "Influenza (this study)" cell is empty are skipped.
    """

    def conv_fun(row):
        # An empty marker cell means the row contributes no record.
        if len(row["Influenza (this study)"]) == 0:
            return None
        return {
            'Symbol': row["Symbol"],
            'Viral-Protein': 'Unknown',
            'Disease': 'Influenza',
            'Method': 'RNAi',
            'Ref': 'Konig-2010',
        }

    process_file(in_file, out_file, conv_fun)


@ruffus.files(os.path.join(options.DATA_DIR, 'Kumar-2010', 'mmc2.csv'),
       os.path.join(options.PROCESSED, 'kumar-2010.csv'))
def process_kumar(in_file, out_file):
    """Normalise the Kumar-2010 table into Influenza / RNAi records.

    Only rows whose "p-value" parses to a number below 0.05 are kept.
    """

    def conv_fun(row):
        significant = float(row["p-value"]) < 0.05
        if not significant:
            return None
        return {
            'Symbol': row["GeneSymbol"],
            'Viral-Protein': 'Unknown',
            'Disease': 'Influenza',
            'Method': 'RNAi',
            'Ref': 'Kumar-2010',
        }

    process_file(in_file, out_file, conv_fun)

@ruffus.files(os.path.join(options.DATA_DIR, 'Shapira-2009', 'mmc2.csv'),
       os.path.join(options.PROCESSED, 'Shapira-2009.csv'))
def process_shapira(in_file, out_file):
    """Normalise the Shapira-2009 table.

    Every input row yields one Influenza / Y2H record.  It yields a further
    record for each column listed in ``pdict`` that holds non-blank text;
    those records are attributed to the study named in ``pdict``.
    """

    # Input column -> constant fields of the extra record it triggers.
    pdict = {
        'HCV Li et al. (25 genes)': {
            'Ref': 'Li-2009',
            'Disease': 'HCV',
            'Method': 'RNAi',
        },
        "WNV Krishnan et al. (14 genes)": {
            'Ref': 'Krishnan-2008',
            'Disease': 'West-Nile',
            'Method': 'RNAi',
        },
        "HIV Zhou et al. (5 genes)": {
            'Ref': 'Zhou-2008',
            'Disease': 'HIV',
            'Method': 'RNAi',
        },
    }
    symbol_id = get_geneID2Symbol()

    def conv_fun(row, id_to_symbol, annotations):
        gene = id_to_symbol[row['entrez gene ID']]
        yield {
            'Symbol': gene,
            'Viral-Protein': 'Unknown',
            'Disease': 'Influenza',
            'Method': 'Y2H',
            'Ref': 'Shapira-2009',
        }
        # NOTE(review): the records below carry no 'Viral-Protein' key, so
        # that column is written empty for them -- confirm this is intended.
        for column in annotations:
            if not row[column].strip():
                continue
            record = dict(annotations[column])
            record['Symbol'] = gene
            yield record

    process_file(in_file, out_file, conv_fun, extra = (symbol_id, pdict))


@ruffus.files(os.path.join(options.DATA_DIR, 'Tai-2009', 'mmc3.csv'),
       os.path.join(options.PROCESSED, 'Tai-2009.csv'))
def process_tai(in_file, out_file):
    """Normalise the Tai-2009 table: one HCV / RNAi record per input row."""

    def conv_fun(row):
        record = {
            'Viral-Protein': 'Unknown',
            'Disease': 'HCV',
            'Method': 'RNAi',
            'Ref': 'Tai-2009',
        }
        record['Symbol'] = row['Symbol']
        return record

    process_file(in_file, out_file, conv_fun)

@ruffus.files([os.path.join(options.DATA_DIR, 'stitch', 'protein.aliases.v8.2.txt'),
               os.path.join(options.DATA_DIR, 'stitch', 'protein_chemical.human.links.v2.0.tsv'),
               os.path.join(options.DATA_DIR, 'stitch', 'chemical.aliases.v2.0.tsv'),],
              os.path.join(options.DATA_DIR, 'stitch', 'processed.csv'))
def process_stitch(in_files, out_file):
    """Write a (Symbol, Chemical) table of high-scoring STITCH links.

    in_files -- three paths, in this order: protein alias table,
                protein-chemical link table, chemical alias table.
    out_file -- tab-separated output with the columns Symbol and Chemical;
                no header line is written.

    A link is kept when its protein id starts with '9606.' and its
    combined_score is greater than ``options.STITCH_CUT``.  Protein and
    chemical ids are replaced by a symbol / alias when one is known and are
    written unchanged otherwise.
    """
    protein_alias, protein_chemical, chemical_alias = in_files
    symbol_id = get_geneID2Symbol(source = 'symbol')

    print('getting protein-conv')
    protein_conv = {}
    with open(protein_alias) as handle:
        fields = ('species', 'protein-id', 'alias', 'source')
        # Discard the first line of the alias file; explicit field names are
        # supplied instead.  The default keeps an empty file from raising.
        next(handle, None)
        for row in csv.DictReader(handle, fieldnames = fields, delimiter = '\t'):
            if row['species'] == '9606' and row['source'] == 'Ensembl_EntrezGene':
                # symbol_id is a defaultdict yielding None for unknown keys.
                # Indexing it would store None here, which both defeats the
                # "fall back to the protein id" lookup below and can
                # overwrite a symbol found through an earlier alias.
                symbol = symbol_id.get(row['alias'])
                if symbol is not None:
                    protein_conv[row['protein-id']] = symbol

    print('getting chem-conv')
    chem_conv = {}
    with open(chemical_alias) as handle:
        for row in csv.DictReader(handle, delimiter = '\t'):
            # When a chemical has several aliases the last one read wins.
            chem_conv[row['chemical']] = row['alias']

    fields = ('Symbol', 'Chemical')
    in_fields = ('chemical', 'protein', 'combined_score')
    cutoff = options.STITCH_CUT
    print('processing interactions')
    with open(out_file, 'w') as out_handle:
        writer = csv.DictWriter(out_handle, fields, delimiter = '\t')
        with open(protein_chemical) as handle:
            reader = csv.DictReader(handle, fieldnames = in_fields,
                                    delimiter = '\t')
            for c, row in enumerate(reader):
                # Progress indicator for the large link table.
                if c % 10000 == 0:
                    print(c)

                if not row['protein'].startswith('9606.'):
                    continue
                if int(row['combined_score']) <= cutoff:
                    continue

                # Strip the '9606.' prefix to get the bare protein id.
                prot = row['protein'].split('.', 1)[1]
                writer.writerow({
                    'Symbol': protein_conv.get(prot, prot),
                    'Chemical': chem_conv.get(row['chemical'], row['chemical']),
                })


def get_geneID2Symbol(source = 'geneid'):
    """Build a lookup from one gene_info column to the gene symbol.

    source -- name of the column whose values become the keys (one of the
              names in ``columns`` below).  For 'symbol' each value is split
              on '|' and every piece becomes a key.

    Returns a defaultdict that answers ``None`` for keys it does not hold.
    The table is read from 'Homo_sapiens.gene_info' in ``options.DATA_DIR``.
    """
    columns = ('taxid', 'geneid', 'symbol', 'locustag', 'synonyms',
               'dbXrefs', 'chromosome', 'maplocation', 'description',
               'type', 'symauth', 'fullauth', 'status', 'other', 'date')
    gene_info = os.path.join(options.DATA_DIR, 'Homo_sapiens.gene_info')
    split_keys = source == 'symbol'

    lookup = defaultdict(lambda : None)
    with open(gene_info) as handle:
        reader = csv.DictReader(handle, fieldnames = columns, delimiter = '\t')
        for record in reader:
            raw_key = record[source]
            keys = raw_key.split('|') if split_keys else [raw_key]
            for key in keys:
                lookup[key] = record['symbol']

    return lookup

def process_file(in_file, out_file, func, extra = tuple()):
    """Convert one tab-separated study table into the common record format.

    in_file  -- path of a tab-separated table whose first line is a header.
    out_file -- path of the tab-separated output; it starts with a header
                naming Symbol, Viral-Protein, Disease, Method and Ref.
    func     -- called as ``func(row, *extra)`` for each input row (``row``
                is the dict produced by csv.DictReader).  It may return a
                dict (one record), a generator of dicts (several records),
                or anything else, e.g. None, to skip the row.
    extra    -- additional positional arguments handed to ``func``.
    """
    fields = ('Symbol', 'Viral-Protein',
              'Disease', 'Method', 'Ref')
    with open(in_file) as in_handle:
        line_gen = csv.DictReader(in_handle, delimiter = '\t')
        with open(out_file, 'w') as out_handle:
            writer = csv.DictWriter(out_handle,
                                    fields,
                                    delimiter = '\t')
            # Emit the header through the writer so it gets the same line
            # terminator as the data rows.
            writer.writerow(dict(zip(fields, fields)))
            for row in line_gen:
                val = func(row, *extra)
                # isinstance also accepts dict subclasses (OrderedDict,
                # defaultdict ...), which an exact type comparison dropped
                # without any warning.
                if isinstance(val, dict):
                    writer.writerow(val)
                elif isinstance(val, GeneratorType):
                    writer.writerows(val)