Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- """
- Given a file containing a table of genes and their coordinates (see "Inputs"), get their genotypes from 1000genomes.
- This in reality is a wrapper to tabix. You need to have tabix installed in the .bin folder, and you also need to have a ./data/tabix_indexes folder on your file system.
- Usage
- =========
- $: python get_genotypes.py -l data/n-glycan.coords
- Inputs
- ==========
- the coordinates file must contain the coordinates of each gene in the analysis, in the following format:
- ::
- DOLK N-glycan substrates chr9 131707808 131710012
- RPN1 N-glycan ost_complex chr3 128338812 128399918
- ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699
- ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181
- UAP1 N-glycan substrates chr1 162531295 162569633
- ST6GAL1 N-glycan branching2 chr3 186648314 186796341
- ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246
- ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313
- ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380
- ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233
- GANAB N-glycan cnx_crt chr11 62392297 62414104
- Note that SNPs in the 1000genomes project are encoded to hg19, so the coordinates must be all refSeq hg19. You can use the script get_gene_coords.py to retrieve the coordinates automatically.
- """
- import optparse
- import subprocess
- import os
- def get_options():
- parser = optparse.OptionParser(usage="python get_genotypes.py -l genes_coords.txt")
- parser.add_option('-l', '-g', '-f', '--list', '--list_of_genes', '--genes', dest='genes',
- help='file containing the list of gene. One symbol per line', default=False)
- parser.add_option('-i', '--index', '--force-generate-index', dest='force_index', action='store_true',
- help='Force generation of tabix index file', default=False)
- parser.add_option('-d', '--download', '--force-download', dest='force_download', action='store_true',
- help='Force download of vcf files', default=False)
- parser.add_option('-u', '--debug', dest='debug', action='store_true', default=True)
- (options, args) = parser.parse_args()
- if options.debug is True:
- logging.basicConfig(level=logging.DEBUG, filename='get_genotypes.log')
- if (options.genes is False) and (len(args) == 0):
- parser.print_help()
- parser.error('get_genotypes.py: genes file not defined.')
- # print args
- # print options.genes
- try:
- genes_path = ''
- if options.genes is not False:
- genes_path = options.genes
- elif args != '':
- genes_path = args[0]
- # print genes_path
- if genes_path != '': #TODO: I don't remember why I put this instruction here.
- genes_h = open(genes_path, 'r')
- except:
- print __doc__
- parser.error("Can not open genes file")
- genes = read_genes_file(genes_path)
- logging.debug(genes[:2])
- return (genes, options)
- def read_genes_file(genes_path):
- """
- DOLK N-glycan substrates chr9 131707808 131710012
- RPN1 N-glycan ost_complex chr3 128338812 128399918
- ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699
- ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181
- UAP1 N-glycan substrates chr1 162531295 162569633
- ST6GAL1 N-glycan branching2 chr3 186648314 186796341
- ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246
- ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313
- ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380
- ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233
- GANAB N-glycan cnx_crt chr11 62392297 62414104
- """
- genes = [line.split() for line in open(genes_path, 'r')]
- return genes
- def get_genotypes(genes, force_index=False, force_download=False, get_individuals=False):
- """
- Call tabix on 1000genomes server to get coordinates.
- Note: the URL to 1000genomes may change over time. If something doesn't work, check it.
- """
- # tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz"
- tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr{0}.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
- individuals_file = "phase1_integrated_calls.20101123.ALL.panel"
- # ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/phase1_integrated_calls.20101123.ALL.panel
- individuals_url = tabix_1000genomes_url.rsplit('/', 1)[0] + '/' + individuals_file
- # Get individuals' info
- if get_individuals is True:
- print "\nDownloading individual infos\n"
- print("wget {}".format(individuals_url))
- subprocess.call("wget {}".format(individuals_url).split())
- print("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file))
- subprocess.call("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file).split())
- # report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\ttot_snps\tgene_length\tsnp_per_kb\n'
- report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\tgene_length\n'
- for fields in genes:
- (gene,pathway,subpathway,chromosome,start,end) = fields[:6]
- chromosome = chromosome.replace('chr', '')
- command = "./bin/tabix -h {0} {1}:{2}-{3}".format(tabix_1000genomes_url.format(chromosome), chromosome, start, end)
- print "\nDownloading genotypes for gene {}, {}:{}-{}".format(gene, chromosome, start,end)
- outputfilepath = './data/vcf/' + gene + '.vcf'
- gzoutputfilepath = './data/vcf/' + gene + '.vcf.gz'
- if (os.path.isfile(gzoutputfilepath)) and (os.stat(gzoutputfilepath).st_size > 0) and (force_download is not True):
- print "- skipping download, as genotypes for gene {} are already available in data/vcf/{}.vcf. Use --force-download option to download again.".format(gene, gene)
- else:
- print "- executing command: " + command
- outputfile = open(outputfilepath, 'w')
- subprocess.call(command.split(), cwd='./data/tabix_indexes', stdout=outputfile)
- subprocess.call(["gzip", "-f", outputfile.name])
- # Calculate how many SNPs correspond to the gene, and update Table S1
- gene_length = int(end) - int(start)
- # number_snps = len(open(outputfilepath,'r').readlines()) - 30
- # if number_snps <= 0:
- # number_snps = 'NA'
- # density = 'NA'
- # else:
- # density = 1000. * number_snps / gene_length
- # report_table_content += "{}\t{}\t{}\t{}\n".format('\t'.join(fields), number_snps, gene_length, density)
- report_table_content += "{}\t{}\n".format('\t'.join(fields), gene_length)
- return report_table_content
- def main():
- (genes, options) = get_options()
- report_table_content = get_genotypes(genes, options.force_index, options.force_download)
- print report_table_content
- pathway = options.genes.split('/')[-1].rsplit('.')[0]
- report_outputfilepath = "report_{}_number_snps.txt".format(pathway)
- report_outputfile = open(report_outputfilepath, 'w')
- report_outputfile.write(report_table_content)
- if __name__ == '__main__':
- import logging
- import doctest
- logging.basicConfig(level=logging.DEBUG)
- doctest.testmod()
- main()
Add Comment
Please, Sign In to add comment