Guest User

Untitled

a guest
Oct 21st, 2018
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.74 KB | None | 0 0
  1. #!/usr/bin/env python
  2. """
  3. Given a file containing a table of genes and their coordinates (see "Inputs"), get their genotypes from 1000genomes.
  4.  
  5. This in reality is a wrapper to tabix. You need to have tabix installed in the .bin folder, and you also need to have a ./data/tabix_indexes folder on your file system.
  6.  
  7. Usage
  8. =========
  9.  
  10. $: python get_genotypes.py -l data/n-glycan.coords
  11.  
  12.  
  13. Inputs
  14. ==========
  15.  
  16. the coordinates file must contain the coordinates of each gene in the analysis, in the following format:
  17.  
  18. ::
  19.  
  20. DOLK N-glycan substrates chr9 131707808 131710012
  21. RPN1 N-glycan ost_complex chr3 128338812 128399918
  22. ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699
  23. ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181
  24. UAP1 N-glycan substrates chr1 162531295 162569633
  25. ST6GAL1 N-glycan branching2 chr3 186648314 186796341
  26. ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246
  27. ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313
  28. ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380
  29. ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233
  30. GANAB N-glycan cnx_crt chr11 62392297 62414104
  31.  
  32. Note that SNPs in the 1000genomes project are encoded to hg19, so the coordinates must be all refSeq hg19. You can use the script get_gene_coords.py to retrieve the coordinates automatically.
  33.  
  34.  
  35. """
  36. import optparse
  37. import subprocess
  38. import os
  39.  
  40.  
  41. def get_options():
  42. parser = optparse.OptionParser(usage="python get_genotypes.py -l genes_coords.txt")
  43. parser.add_option('-l', '-g', '-f', '--list', '--list_of_genes', '--genes', dest='genes',
  44. help='file containing the list of gene. One symbol per line', default=False)
  45. parser.add_option('-i', '--index', '--force-generate-index', dest='force_index', action='store_true',
  46. help='Force generation of tabix index file', default=False)
  47. parser.add_option('-d', '--download', '--force-download', dest='force_download', action='store_true',
  48. help='Force download of vcf files', default=False)
  49. parser.add_option('-u', '--debug', dest='debug', action='store_true', default=True)
  50. (options, args) = parser.parse_args()
  51.  
  52. if options.debug is True:
  53. logging.basicConfig(level=logging.DEBUG, filename='get_genotypes.log')
  54.  
  55. if (options.genes is False) and (len(args) == 0):
  56. parser.print_help()
  57. parser.error('get_genotypes.py: genes file not defined.')
  58. # print args
  59. # print options.genes
  60. try:
  61. genes_path = ''
  62. if options.genes is not False:
  63. genes_path = options.genes
  64. elif args != '':
  65. genes_path = args[0]
  66. # print genes_path
  67. if genes_path != '': #TODO: I don't remember why I put this instruction here.
  68. genes_h = open(genes_path, 'r')
  69. except:
  70. print __doc__
  71. parser.error("Can not open genes file")
  72.  
  73. genes = read_genes_file(genes_path)
  74. logging.debug(genes[:2])
  75.  
  76. return (genes, options)
  77.  
  78. def read_genes_file(genes_path):
  79. """
  80. DOLK N-glycan substrates chr9 131707808 131710012
  81. RPN1 N-glycan ost_complex chr3 128338812 128399918
  82. ALG8 N-glycan precursor_biosynthesis chr11 77811987 77850699
  83. ALG9 N-glycan precursor_biosynthesis chr11 111652918 111750181
  84. UAP1 N-glycan substrates chr1 162531295 162569633
  85. ST6GAL1 N-glycan branching2 chr3 186648314 186796341
  86. ALG2 N-glycan precursor_biosynthesis chr9 101978706 101984246
  87. ALG3 N-glycan precursor_biosynthesis chr3 183960116 183967313
  88. ALG1 N-glycan precursor_biosynthesis chr16 5083702 5137380
  89. ALG6 N-glycan precursor_biosynthesis chr1 63833260 63904233
  90. GANAB N-glycan cnx_crt chr11 62392297 62414104
  91.  
  92. """
  93. genes = [line.split() for line in open(genes_path, 'r')]
  94. return genes
  95.  
  96.  
  97. def get_genotypes(genes, force_index=False, force_download=False, get_individuals=False):
  98. """
  99. Call tabix on 1000genomes server to get coordinates.
  100.  
  101. Note: the URL to 1000genomes may change over time. If something doesn't work, check it.
  102. """
  103. # tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz"
  104. tabix_1000genomes_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.chr{0}.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
  105. individuals_file = "phase1_integrated_calls.20101123.ALL.panel"
  106. # ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/phase1_integrated_calls.20101123.ALL.panel
  107. individuals_url = tabix_1000genomes_url.rsplit('/', 1)[0] + '/' + individuals_file
  108.  
  109. # Get individuals' info
  110. if get_individuals is True:
  111. print "\nDownloading individual infos\n"
  112. print("wget {}".format(individuals_url))
  113. subprocess.call("wget {}".format(individuals_url).split())
  114. print("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file))
  115. subprocess.call("mv {} data/individuals/{}.individuals".format(individuals_file, individuals_file).split())
  116.  
  117. # report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\ttot_snps\tgene_length\tsnp_per_kb\n'
  118. report_table_content = 'gene\tpathway\tsubpathway\tchromosome\tstart\tend\tgene_length\n'
  119. for fields in genes:
  120. (gene,pathway,subpathway,chromosome,start,end) = fields[:6]
  121.  
  122. chromosome = chromosome.replace('chr', '')
  123. command = "./bin/tabix -h {0} {1}:{2}-{3}".format(tabix_1000genomes_url.format(chromosome), chromosome, start, end)
  124. print "\nDownloading genotypes for gene {}, {}:{}-{}".format(gene, chromosome, start,end)
  125. outputfilepath = './data/vcf/' + gene + '.vcf'
  126. gzoutputfilepath = './data/vcf/' + gene + '.vcf.gz'
  127. if (os.path.isfile(gzoutputfilepath)) and (os.stat(gzoutputfilepath).st_size > 0) and (force_download is not True):
  128. print "- skipping download, as genotypes for gene {} are already available in data/vcf/{}.vcf. Use --force-download option to download again.".format(gene, gene)
  129. else:
  130. print "- executing command: " + command
  131. outputfile = open(outputfilepath, 'w')
  132. subprocess.call(command.split(), cwd='./data/tabix_indexes', stdout=outputfile)
  133. subprocess.call(["gzip", "-f", outputfile.name])
  134.  
  135. # Calculate how many SNPs correspond to the gene, and update Table S1
  136. gene_length = int(end) - int(start)
  137. # number_snps = len(open(outputfilepath,'r').readlines()) - 30
  138. # if number_snps <= 0:
  139. # number_snps = 'NA'
  140. # density = 'NA'
  141. # else:
  142. # density = 1000. * number_snps / gene_length
  143.  
  144. # report_table_content += "{}\t{}\t{}\t{}\n".format('\t'.join(fields), number_snps, gene_length, density)
  145. report_table_content += "{}\t{}\n".format('\t'.join(fields), gene_length)
  146.  
  147.  
  148. return report_table_content
  149.  
  150. def main():
  151. (genes, options) = get_options()
  152. report_table_content = get_genotypes(genes, options.force_index, options.force_download)
  153. print report_table_content
  154. pathway = options.genes.split('/')[-1].rsplit('.')[0]
  155. report_outputfilepath = "report_{}_number_snps.txt".format(pathway)
  156. report_outputfile = open(report_outputfilepath, 'w')
  157. report_outputfile.write(report_table_content)
  158.  
  159.  
  160. if __name__ == '__main__':
  161. import logging
  162. import doctest
  163. logging.basicConfig(level=logging.DEBUG)
  164. doctest.testmod()
  165. main()
Add Comment
Please, Sign In to add comment