Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- """calculates MLST types from assemblies using BLAST.
- If the gene is truncated, it will report a "T" and if
- the gene has no blast hit, it will report a "X".
- Your reference allele names must all end in "fasta"
- and must contain a "_" between gene name and number.
- The only external dependency is blast+ - tested version
- is 2.2.31 - and igs multithreading tools"""
- from __future__ import print_function
- import optparse
- import glob
- import subprocess
- import os
- import re
- import sys
- import shutil
- def test_file(option, opt_str, value, parser):
- try:
- with open(value): setattr(parser.values, option.dest, value)
- except IOError:
- print('%s cannot be opened' % option)
- sys.exit()
- def test_dir(option, opt_str, value, parser):
- if os.path.exists(value):
- setattr(parser.values, option.dest, value)
- else:
- print("%s cannot be found" % option)
- sys.exit()
- def mp_shell(func, params, numProc):
- from multiprocessing import Pool
- p = Pool(numProc)
- out = p.map(func, params)
- p.terminate()
- return out
- def get_gene_order(reference):
- firstLine = open(reference).readline()
- fields = firstLine.split()
- ordered = fields[1:]
- return ordered
- def get_seq_name(in_fasta):
- """used for renaming the sequences"""
- return os.path.basename(in_fasta)
- def process_results_dev(directory,ordered,trunc,new,reference,prefix,num_markers):
- outfile = open("%s.profile.txt" % prefix, "w")
- pad_order = list(ordered)
- pad_order.insert(0,"genome")
- pad_order.insert(num_markers+1,"ST")
- outfile.write("\t".join(pad_order))
- outfile.write("\n")
- for infile in glob.glob(os.path.join(directory,"*blast.redux.out")):
- alist = []
- file_name = get_seq_name(infile)
- genome_name = file_name.replace(".fasta.blast.redux.out","")
- final_profile = []
- final_profile.append(genome_name)
- allele_profile = {}
- for allele in ordered:
- for line in open(infile, "rU"):
- newline = line.strip("\n")
- fields = newline.split()
- allele_fields = fields[0].split("_")
- allele_name = allele_fields[0]
- """This is where there is a match"""
- if allele in allele_name:
- try:
- if int(fields[3])/int(fields[7])>=trunc and float(fields[2])>=float(new):
- allele_profile.update({allele:allele_fields[1]})
- elif int(fields[3])/int(fields[7])>=trunc and float(fields[2])<float(new):
- allele_profile.update({allele:"N"})
- else:
- allele_profile.update({allele:"T"})
- except:
- allele_profile.update({allele:"X"})
- for allele in ordered:
- if allele in allele_profile:
- final_profile.append(allele_profile.get(allele))
- else:
- final_profile.append("X")
- if "X" in final_profile[1:]:
- final_profile.insert(len(final_profile),"missing")
- elif "T" in final_profile[1:]:
- final_profile.insert(len(final_profile),"truncated")
- elif "N" in final_profile[1:]:
- final_profile.insert(len(final_profile),"novel")
- else:
- pass
- """If this is the case, I know that there are no ST matches"""
- if "X" in final_profile[1:-1] or "N" in final_profile[1:-1] or "T" in final_profile[1:-1]:
- outfile.write("\t".join(final_profile))
- outfile.write("\n")
- else:
- """need to read in the first line of this file first"""
- firstLine = open(reference).readline()
- first_fields = firstLine.split()
- if "clonal_complex" in first_fields:
- """This ignores the clonal complex and ST fields,
- seems to be working correclty"""
- first_fields = first_fields[1:-1]
- num_fields = len(first_fields[1:])
- else:
- num_fields = len(first_fields[1:])
- for line in open(reference, "rU"):
- sum =[]
- ref_fields=line.split()
- """Sum is how many matches that I have.
- Must equal the number of alleles to receive
- a sequence type"""
- for i in range(1,num_fields+1):
- try:
- if ref_fields[i]==final_profile[i]:
- sum.append(i)
- except:
- pass
- if int(len(sum))==int(num_fields):
- """if there is a complete match, then add the Sequence type
- to the beginning of the list"""
- final_profile.insert(int(num_fields)+1,ref_fields[0])
- else:
- pass
- #final_profile.insert(int(num_fields)+1,"novel")
- """This assumes that the profile is complete"""
- if len(final_profile) == 9:
- if final_profile[-1] == "novel" or final_profile[-1] == "truncated":
- pass
- else:
- outfile.write("\t".join(final_profile))
- outfile.write("\n")
- elif len(final_profile) == 8:
- final_profile.insert(int(num_fields)+1,"novel")
- outfile.write("\t".join(final_profile))
- outfile.write("\n")
- outfile.close()
- def _perform_blast_workflow(data):
- genome = "".join(data)
- if genome.endswith(".fasta"):
- try:
- subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % genome, shell=True)
- except:
- print("problem found in formatting genome %s" % genome)
- if ".fasta" in genome:
- try:
- devnull = open('/dev/null', 'w')
- cmd = ["blastn",
- "-task", "blastn",
- "-query", "all_loci.fasta",
- "-db", genome,
- "-out", "%s_blast.out" % (genome),
- "-evalue", "0.0001",
- "-dust", "no",
- "-outfmt", "6"]
- subprocess.call(cmd, stdout=devnull, stderr=devnull)
- devnull.close()
- except:
- print("genomes %s cannot be used" % genome)
- ref_list = []
- newfile = open("%s.blast.redux.out" % genome, "w")
- os.system("sort -gr -k 12,12 %s_blast.out > %s.blast.sorted" % (genome,genome))
- for line in open("%s.blast.sorted" % genome):
- newline = line.strip()
- fields = line.split()
- allele_names = fields[0].split("_")
- if allele_names[0] in ref_list:
- pass
- else:
- ref_list.append(allele_names[0])
- newfile.write(line)
- newfile.close()
- def run_blast_dev(ref_loci_file,processors,genome_list):
- curr_dir=os.getcwd()
- mp_shell(_perform_blast_workflow,genome_list,processors)
- def make_ST_map(profile):
- map = {}
- with open(profile) as my_profile:
- for line in my_profile:
- newline = line.strip()
- fields = newline.split()
- if fields[0] == "genome":
- pass
- else:
- if len(fields) == 9:
- map.update({fields[0]:fields[8]})
- else:
- map.update({fields[0]:"unknown"})
- return map
- def main(directory,fastas,trunc,new,processors,prefix):
- curr_dir = os.getcwd()
- ap = os.path.abspath(curr_dir)
- ref_seq_dir=os.path.abspath(directory)
- reference = "".join(glob.glob(os.path.join(ref_seq_dir, "*.txt")))
- ordered = get_gene_order(reference)
- fastas_path = os.path.abspath(fastas)
- if os.path.exists("%s/mlst_scratch" % ap):
- os.system("rm -rf %s/mlst_scratch" % ap)
- os.makedirs("%s/mlst_scratch" % ap)
- else:
- os.makedirs("%s/mlst_scratch" % ap)
- if os.path.exists("%s/ST_renames" % ap):
- os.system("rm -rf %s/ST_renames" % ap)
- os.makedirs("%s/ST_renames" % ap)
- else:
- os.makedirs("%s/ST_renames" % ap)
- tmp_dir = "%s/mlst_scratch" % ap
- if "clonal_complex" in ordered:
- ordered=ordered[:-1]
- """file_dir is a list of all of your FASTA files"""
- for afile in glob.glob(os.path.join(fastas,"*fasta")):
- shutil.copy2(afile,tmp_dir)
- files_and_temp_names = []
- for infile in glob.glob(os.path.join(tmp_dir,"*fasta")):
- genome = get_seq_name(infile)
- files_and_temp_names.append(genome)
- os.chdir(tmp_dir)
- os.system("cat %s/*fasta > ./all_loci.fasta" % ref_seq_dir)
- print("blast starting")
- run_blast_dev("all_loci.fasta",processors,files_and_temp_names)
- print("blast finished, parsing results")
- process_results_dev(tmp_dir,ordered,trunc,new,reference,prefix,len(ordered))
- st_map = make_ST_map("%s.profile.txt" % prefix)
- for k,v in st_map.items():
- old_name = str(k)+".fasta"
- new_name = str(k)+"_ST"+str(v)+".fasta"
- os.system("cp %s/%s %s/ST_renames/%s" % (fastas_path,old_name,ap,new_name))
- os.system("mv *.profile.txt %s" % ap)
- os.chdir(ap)
- os.system("rm -rf %s/mlst_scratch" % ap)
- if __name__ == "__main__":
- usage="usage: %prog [options]"
- parser = optparse.OptionParser(usage=usage)
- parser.add_option("-d", "--MLST_dir", dest="directory",
- help="/path/to/MLST_seq_directory [REQUIRED]",
- action="callback", callback=test_dir, type="string")
- parser.add_option("-f", "--fasta dir", dest="fastas",
- help="/path/to/your_genome_FASTA_dir [REQUIRED]",
- action="callback", callback=test_dir, type="string")
- parser.add_option("-t", "--truncated value", dest="trunc",
- help="gene is considered truncated if below this value [default = 0.95]",
- default="0.95", type="float")
- parser.add_option("-n", "--new", dest="new",
- help="under this value is considered to be a new allele",
- default="100", type="int")
- parser.add_option("-p", "--processors", dest="processors",
- help="Number of processors to use, defaults to 2",
- default="2", type="int", action="store")
- parser.add_option("-y", "--prefix", dest="prefix",
- help="Output prefix for files [REQUIRED]",
- action="store", type="string")
- options, args = parser.parse_args()
- mandatories = ["directory","fastas","prefix"]
- for m in mandatories:
- if not getattr(options, m, None):
- print("\nMust provide %s.\n" %m)
- parser.print_help()
- exit(-1)
- main(options.directory,options.fastas,options.trunc,options.new,options.processors,options.prefix)
Add Comment
Please, Sign In to add comment