Guest User

Untitled

a guest
Mar 21st, 2018
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.61 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. """calculates MLST types from assemblies using BLAST.
  4. If the gene is truncated, it will report a "T" and if
  5. the gene has no blast hit, it will report a "X".
  6. Your reference allele names must all end in "fasta"
  7. and must contain a "_" between gene name and number.
  8. The only external dependency is blast+ - tested version
  9. is 2.2.31 - and igs multithreading tools"""
  10.  
  11. from __future__ import print_function
  12. import optparse
  13. import glob
  14. import subprocess
  15. import os
  16. import re
  17. import sys
  18. import shutil
  19.  
  20. def test_file(option, opt_str, value, parser):
  21. try:
  22. with open(value): setattr(parser.values, option.dest, value)
  23. except IOError:
  24. print('%s cannot be opened' % option)
  25. sys.exit()
  26.  
  27. def test_dir(option, opt_str, value, parser):
  28. if os.path.exists(value):
  29. setattr(parser.values, option.dest, value)
  30. else:
  31. print("%s cannot be found" % option)
  32. sys.exit()
  33.  
  34. def mp_shell(func, params, numProc):
  35. from multiprocessing import Pool
  36. p = Pool(numProc)
  37. out = p.map(func, params)
  38. p.terminate()
  39. return out
  40.  
  41. def get_gene_order(reference):
  42. firstLine = open(reference).readline()
  43. fields = firstLine.split()
  44. ordered = fields[1:]
  45. return ordered
  46.  
  47. def get_seq_name(in_fasta):
  48. """used for renaming the sequences"""
  49. return os.path.basename(in_fasta)
  50.  
  51. def process_results_dev(directory,ordered,trunc,new,reference,prefix,num_markers):
  52. outfile = open("%s.profile.txt" % prefix, "w")
  53. pad_order = list(ordered)
  54. pad_order.insert(0,"genome")
  55. pad_order.insert(num_markers+1,"ST")
  56. outfile.write("\t".join(pad_order))
  57. outfile.write("\n")
  58. for infile in glob.glob(os.path.join(directory,"*blast.redux.out")):
  59. alist = []
  60. file_name = get_seq_name(infile)
  61. genome_name = file_name.replace(".fasta.blast.redux.out","")
  62. final_profile = []
  63. final_profile.append(genome_name)
  64. allele_profile = {}
  65. for allele in ordered:
  66. for line in open(infile, "rU"):
  67. newline = line.strip("\n")
  68. fields = newline.split()
  69. allele_fields = fields[0].split("_")
  70. allele_name = allele_fields[0]
  71. """This is where there is a match"""
  72. if allele in allele_name:
  73. try:
  74. if int(fields[3])/int(fields[7])>=trunc and float(fields[2])>=float(new):
  75. allele_profile.update({allele:allele_fields[1]})
  76. elif int(fields[3])/int(fields[7])>=trunc and float(fields[2])<float(new):
  77. allele_profile.update({allele:"N"})
  78. else:
  79. allele_profile.update({allele:"T"})
  80. except:
  81. allele_profile.update({allele:"X"})
  82. for allele in ordered:
  83. if allele in allele_profile:
  84. final_profile.append(allele_profile.get(allele))
  85. else:
  86. final_profile.append("X")
  87. if "X" in final_profile[1:]:
  88. final_profile.insert(len(final_profile),"missing")
  89. elif "T" in final_profile[1:]:
  90. final_profile.insert(len(final_profile),"truncated")
  91. elif "N" in final_profile[1:]:
  92. final_profile.insert(len(final_profile),"novel")
  93. else:
  94. pass
  95. """If this is the case, I know that there are no ST matches"""
  96. if "X" in final_profile[1:-1] or "N" in final_profile[1:-1] or "T" in final_profile[1:-1]:
  97. outfile.write("\t".join(final_profile))
  98. outfile.write("\n")
  99. else:
  100. """need to read in the first line of this file first"""
  101. firstLine = open(reference).readline()
  102. first_fields = firstLine.split()
  103. if "clonal_complex" in first_fields:
  104. """This ignores the clonal complex and ST fields,
  105. seems to be working correclty"""
  106. first_fields = first_fields[1:-1]
  107. num_fields = len(first_fields[1:])
  108. else:
  109. num_fields = len(first_fields[1:])
  110. for line in open(reference, "rU"):
  111. sum =[]
  112. ref_fields=line.split()
  113. """Sum is how many matches that I have.
  114. Must equal the number of alleles to receive
  115. a sequence type"""
  116. for i in range(1,num_fields+1):
  117. try:
  118. if ref_fields[i]==final_profile[i]:
  119. sum.append(i)
  120. except:
  121. pass
  122. if int(len(sum))==int(num_fields):
  123. """if there is a complete match, then add the Sequence type
  124. to the beginning of the list"""
  125. final_profile.insert(int(num_fields)+1,ref_fields[0])
  126. else:
  127. pass
  128. #final_profile.insert(int(num_fields)+1,"novel")
  129. """This assumes that the profile is complete"""
  130. if len(final_profile) == 9:
  131. if final_profile[-1] == "novel" or final_profile[-1] == "truncated":
  132. pass
  133. else:
  134. outfile.write("\t".join(final_profile))
  135. outfile.write("\n")
  136. elif len(final_profile) == 8:
  137. final_profile.insert(int(num_fields)+1,"novel")
  138. outfile.write("\t".join(final_profile))
  139. outfile.write("\n")
  140. outfile.close()
  141.  
  142. def _perform_blast_workflow(data):
  143. genome = "".join(data)
  144. if genome.endswith(".fasta"):
  145. try:
  146. subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % genome, shell=True)
  147. except:
  148. print("problem found in formatting genome %s" % genome)
  149. if ".fasta" in genome:
  150. try:
  151. devnull = open('/dev/null', 'w')
  152. cmd = ["blastn",
  153. "-task", "blastn",
  154. "-query", "all_loci.fasta",
  155. "-db", genome,
  156. "-out", "%s_blast.out" % (genome),
  157. "-evalue", "0.0001",
  158. "-dust", "no",
  159. "-outfmt", "6"]
  160. subprocess.call(cmd, stdout=devnull, stderr=devnull)
  161. devnull.close()
  162. except:
  163. print("genomes %s cannot be used" % genome)
  164. ref_list = []
  165. newfile = open("%s.blast.redux.out" % genome, "w")
  166. os.system("sort -gr -k 12,12 %s_blast.out > %s.blast.sorted" % (genome,genome))
  167. for line in open("%s.blast.sorted" % genome):
  168. newline = line.strip()
  169. fields = line.split()
  170. allele_names = fields[0].split("_")
  171. if allele_names[0] in ref_list:
  172. pass
  173. else:
  174. ref_list.append(allele_names[0])
  175. newfile.write(line)
  176. newfile.close()
  177.  
  178. def run_blast_dev(ref_loci_file,processors,genome_list):
  179. curr_dir=os.getcwd()
  180. mp_shell(_perform_blast_workflow,genome_list,processors)
  181.  
  182. def make_ST_map(profile):
  183. map = {}
  184. with open(profile) as my_profile:
  185. for line in my_profile:
  186. newline = line.strip()
  187. fields = newline.split()
  188. if fields[0] == "genome":
  189. pass
  190. else:
  191. if len(fields) == 9:
  192. map.update({fields[0]:fields[8]})
  193. else:
  194. map.update({fields[0]:"unknown"})
  195. return map
  196.  
  197. def main(directory,fastas,trunc,new,processors,prefix):
  198. curr_dir = os.getcwd()
  199. ap = os.path.abspath(curr_dir)
  200. ref_seq_dir=os.path.abspath(directory)
  201. reference = "".join(glob.glob(os.path.join(ref_seq_dir, "*.txt")))
  202. ordered = get_gene_order(reference)
  203. fastas_path = os.path.abspath(fastas)
  204. if os.path.exists("%s/mlst_scratch" % ap):
  205. os.system("rm -rf %s/mlst_scratch" % ap)
  206. os.makedirs("%s/mlst_scratch" % ap)
  207. else:
  208. os.makedirs("%s/mlst_scratch" % ap)
  209. if os.path.exists("%s/ST_renames" % ap):
  210. os.system("rm -rf %s/ST_renames" % ap)
  211. os.makedirs("%s/ST_renames" % ap)
  212. else:
  213. os.makedirs("%s/ST_renames" % ap)
  214. tmp_dir = "%s/mlst_scratch" % ap
  215. if "clonal_complex" in ordered:
  216. ordered=ordered[:-1]
  217. """file_dir is a list of all of your FASTA files"""
  218. for afile in glob.glob(os.path.join(fastas,"*fasta")):
  219. shutil.copy2(afile,tmp_dir)
  220. files_and_temp_names = []
  221. for infile in glob.glob(os.path.join(tmp_dir,"*fasta")):
  222. genome = get_seq_name(infile)
  223. files_and_temp_names.append(genome)
  224. os.chdir(tmp_dir)
  225. os.system("cat %s/*fasta > ./all_loci.fasta" % ref_seq_dir)
  226. print("blast starting")
  227. run_blast_dev("all_loci.fasta",processors,files_and_temp_names)
  228. print("blast finished, parsing results")
  229. process_results_dev(tmp_dir,ordered,trunc,new,reference,prefix,len(ordered))
  230. st_map = make_ST_map("%s.profile.txt" % prefix)
  231. for k,v in st_map.items():
  232. old_name = str(k)+".fasta"
  233. new_name = str(k)+"_ST"+str(v)+".fasta"
  234. os.system("cp %s/%s %s/ST_renames/%s" % (fastas_path,old_name,ap,new_name))
  235. os.system("mv *.profile.txt %s" % ap)
  236. os.chdir(ap)
  237. os.system("rm -rf %s/mlst_scratch" % ap)
  238.  
  239. if __name__ == "__main__":
  240. usage="usage: %prog [options]"
  241. parser = optparse.OptionParser(usage=usage)
  242. parser.add_option("-d", "--MLST_dir", dest="directory",
  243. help="/path/to/MLST_seq_directory [REQUIRED]",
  244. action="callback", callback=test_dir, type="string")
  245. parser.add_option("-f", "--fasta dir", dest="fastas",
  246. help="/path/to/your_genome_FASTA_dir [REQUIRED]",
  247. action="callback", callback=test_dir, type="string")
  248. parser.add_option("-t", "--truncated value", dest="trunc",
  249. help="gene is considered truncated if below this value [default = 0.95]",
  250. default="0.95", type="float")
  251. parser.add_option("-n", "--new", dest="new",
  252. help="under this value is considered to be a new allele",
  253. default="100", type="int")
  254. parser.add_option("-p", "--processors", dest="processors",
  255. help="Number of processors to use, defaults to 2",
  256. default="2", type="int", action="store")
  257. parser.add_option("-y", "--prefix", dest="prefix",
  258. help="Output prefix for files [REQUIRED]",
  259. action="store", type="string")
  260. options, args = parser.parse_args()
  261.  
  262. mandatories = ["directory","fastas","prefix"]
  263. for m in mandatories:
  264. if not getattr(options, m, None):
  265. print("\nMust provide %s.\n" %m)
  266. parser.print_help()
  267. exit(-1)
  268. main(options.directory,options.fastas,options.trunc,options.new,options.processors,options.prefix)
Add Comment
Please, Sign In to add comment