Untitled

#!/usr/bin/env python

"""calculates MLST types from assemblies using BLAST.
If the gene is truncated, it will report a "T" and if
the gene has no blast hit, it will report a "X".
Your reference allele names must all end in "fasta"
and must contain a "_" between gene name and number.
The only external dependency is blast+ - tested version
is 2.2.31 - and igs multithreading tools"""

from __future__ import print_function
import optparse
import glob
import subprocess
import os
import re
import sys
import shutil

def test_file(option, opt_str, value, parser):
    try:
        with open(value): setattr(parser.values, option.dest, value)
    except IOError:
        print('%s cannot be opened' % option)
        sys.exit()

def test_dir(option, opt_str, value, parser):
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print("%s cannot be found" % option)
        sys.exit()

def mp_shell(func, params, numProc):
    from multiprocessing import Pool
    p = Pool(numProc)
    out = p.map(func, params)
    p.terminate()
    return out

def get_gene_order(reference):
    firstLine = open(reference).readline()
    fields = firstLine.split()
    ordered = fields[1:]
    return ordered

def get_seq_name(in_fasta):
    """used for renaming the sequences"""
    return os.path.basename(in_fasta)

def process_results_dev(directory,ordered,trunc,new,reference,prefix,num_markers):
    outfile = open("%s.profile.txt" % prefix, "w")
    pad_order = list(ordered)
    pad_order.insert(0,"genome")
    pad_order.insert(num_markers+1,"ST")
    outfile.write("\t".join(pad_order))
    outfile.write("\n")
    for infile in glob.glob(os.path.join(directory,"*blast.redux.out")):
        alist = []
        file_name = get_seq_name(infile)
        genome_name = file_name.replace(".fasta.blast.redux.out","")
        final_profile = []
        final_profile.append(genome_name)
        allele_profile = {}
        for allele in ordered:
            for line in open(infile, "rU"):
                newline = line.strip("\n")
                fields = newline.split()
                allele_fields = fields[0].split("_")
                allele_name = allele_fields[0]
                """This is where there is a match"""
                if allele in allele_name:
                    try:
                        if int(fields[3])/int(fields[7])>=trunc and float(fields[2])>=float(new):
                            allele_profile.update({allele:allele_fields[1]})
                        elif int(fields[3])/int(fields[7])>=trunc and float(fields[2])<float(new):
                            allele_profile.update({allele:"N"})
                        else:
                            allele_profile.update({allele:"T"})
                    except:
                        allele_profile.update({allele:"X"})
        for allele in ordered:
            if allele in allele_profile:
                final_profile.append(allele_profile.get(allele))
            else:
                final_profile.append("X")
        if "X" in final_profile[1:]:
            final_profile.insert(len(final_profile),"missing")
        elif "T" in final_profile[1:]:
            final_profile.insert(len(final_profile),"truncated")
        elif "N" in final_profile[1:]:
            final_profile.insert(len(final_profile),"novel")
        else:
            pass
        """If this is the case, I know that there are no ST matches"""
        if "X" in final_profile[1:-1] or "N" in final_profile[1:-1] or "T" in final_profile[1:-1]:
            outfile.write("\t".join(final_profile))
            outfile.write("\n")
        else:
            """need to read in the first line of this file first"""
            firstLine = open(reference).readline()
            first_fields = firstLine.split()
            if "clonal_complex" in first_fields:
                """This ignores the clonal complex and ST fields,
                seems to be working correclty"""
                first_fields = first_fields[1:-1]
                num_fields = len(first_fields[1:])
            else:
                num_fields = len(first_fields[1:])
            for line in open(reference, "rU"):
                sum =[]
                ref_fields=line.split()
                """Sum is how many matches that I have.
                Must equal the number of alleles to receive
                a sequence type"""
                for i in range(1,num_fields+1):
                    try:
                        if ref_fields[i]==final_profile[i]:
                            sum.append(i)
                    except:
                        pass
                if int(len(sum))==int(num_fields):
                    """if there is a complete match, then add the Sequence type
                    to the beginning of the list"""
                    final_profile.insert(int(num_fields)+1,ref_fields[0])
                else:
                    pass
                    #final_profile.insert(int(num_fields)+1,"novel")
        """This assumes that the profile is complete"""
        if len(final_profile) == 9:
            if final_profile[-1] == "novel" or final_profile[-1] == "truncated":
                pass
            else:
                outfile.write("\t".join(final_profile))
                outfile.write("\n")
        elif len(final_profile) == 8:
            final_profile.insert(int(num_fields)+1,"novel")
            outfile.write("\t".join(final_profile))
            outfile.write("\n")
    outfile.close()

def _perform_blast_workflow(data):
    genome = "".join(data)
    if genome.endswith(".fasta"):
        try:
            subprocess.check_call("makeblastdb -in %s -dbtype nucl > /dev/null 2>&1" % genome, shell=True)
        except:
            print("problem found in formatting genome %s" % genome)
    if ".fasta" in genome:
        try:
            devnull = open('/dev/null', 'w')
            cmd = ["blastn",
                   "-task", "blastn",
                   "-query", "all_loci.fasta",
                   "-db", genome,
                   "-out", "%s_blast.out" % (genome),
                   "-evalue", "0.0001",
                   "-dust", "no",
                   "-outfmt", "6"]
            subprocess.call(cmd, stdout=devnull, stderr=devnull)
            devnull.close()
        except:
            print("genomes %s cannot be used" % genome)
    ref_list = []
    newfile = open("%s.blast.redux.out" % genome, "w")
    os.system("sort -gr -k 12,12 %s_blast.out > %s.blast.sorted" % (genome,genome))
    for line in open("%s.blast.sorted" % genome):
        newline = line.strip()
        fields = line.split()
        allele_names = fields[0].split("_")
        if allele_names[0] in ref_list:
            pass
        else:
            ref_list.append(allele_names[0])
            newfile.write(line)
    newfile.close()

def run_blast_dev(ref_loci_file,processors,genome_list):
    curr_dir=os.getcwd()
    mp_shell(_perform_blast_workflow,genome_list,processors)

def make_ST_map(profile):
    map = {}
    with open(profile) as my_profile:
        for line in my_profile:
            newline = line.strip()
            fields = newline.split()
            if fields[0] == "genome":
                pass
            else:
                if len(fields) == 9:
                    map.update({fields[0]:fields[8]})
                else:
                    map.update({fields[0]:"unknown"})
    return map

def main(directory,fastas,trunc,new,processors,prefix):
    curr_dir = os.getcwd()
    ap = os.path.abspath(curr_dir)
    ref_seq_dir=os.path.abspath(directory)
    reference = "".join(glob.glob(os.path.join(ref_seq_dir, "*.txt")))
    ordered = get_gene_order(reference)
    fastas_path = os.path.abspath(fastas)
    if os.path.exists("%s/mlst_scratch" % ap):
        os.system("rm -rf %s/mlst_scratch" % ap)
        os.makedirs("%s/mlst_scratch" % ap)
    else:
        os.makedirs("%s/mlst_scratch" % ap)
    if os.path.exists("%s/ST_renames" % ap):
        os.system("rm -rf %s/ST_renames" % ap)
        os.makedirs("%s/ST_renames" % ap)
    else:
        os.makedirs("%s/ST_renames" % ap)
    tmp_dir = "%s/mlst_scratch" % ap
    if "clonal_complex" in ordered:
        ordered=ordered[:-1]
    """file_dir is a list of all of your FASTA files"""
    for afile in glob.glob(os.path.join(fastas,"*fasta")):
        shutil.copy2(afile,tmp_dir)
    files_and_temp_names = []
    for infile in glob.glob(os.path.join(tmp_dir,"*fasta")):
        genome = get_seq_name(infile)
        files_and_temp_names.append(genome)
    os.chdir(tmp_dir)
    os.system("cat %s/*fasta > ./all_loci.fasta" % ref_seq_dir)
    print("blast starting")
    run_blast_dev("all_loci.fasta",processors,files_and_temp_names)
    print("blast finished, parsing results")
    process_results_dev(tmp_dir,ordered,trunc,new,reference,prefix,len(ordered))
    st_map = make_ST_map("%s.profile.txt" % prefix)
    for k,v in st_map.items():
        old_name = str(k)+".fasta"
        new_name = str(k)+"_ST"+str(v)+".fasta"
        os.system("cp %s/%s %s/ST_renames/%s" % (fastas_path,old_name,ap,new_name))
    os.system("mv *.profile.txt %s" % ap)
    os.chdir(ap)
    os.system("rm -rf %s/mlst_scratch" % ap)

if __name__ == "__main__":
    usage="usage: %prog [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-d", "--MLST_dir", dest="directory",
                      help="/path/to/MLST_seq_directory [REQUIRED]",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-f", "--fasta dir", dest="fastas",
                      help="/path/to/your_genome_FASTA_dir [REQUIRED]",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-t", "--truncated value", dest="trunc",
                      help="gene is considered truncated if below this value [default = 0.95]",
                      default="0.95", type="float")
    parser.add_option("-n", "--new", dest="new",
                      help="under this value is considered to be a new allele",
                      default="100", type="int")
    parser.add_option("-p", "--processors", dest="processors",
                      help="Number of processors to use, defaults to 2",
                      default="2", type="int", action="store")
    parser.add_option("-y", "--prefix", dest="prefix",
                      help="Output prefix for files [REQUIRED]",
                      action="store", type="string")
    options, args = parser.parse_args()

    mandatories = ["directory","fastas","prefix"]
    for m in mandatories:
        if not getattr(options, m, None):
            print("\nMust provide %s.\n" %m)
            parser.print_help()
            exit(-1)
    main(options.directory,options.fastas,options.trunc,options.new,options.processors,options.prefix)