Untitled

def seq_reanalysis(kraken_table, kraken_labels, out_dir, user_format, renamed_file1, subset, renamed_file2 = False):
    kraken_colNames = ["kraken_classified", "Seq_ID","kraken_tax_ID", "kraken_length", "kraken_k-mer"]
    kraken_fullTable = format_result_table(out_dir, "kraken_table.txt", "kraken_labels.txt", kraken_colNames)
	# Save full kraken table, compress and delete unformatted kraken table
    kraken_fullTable.to_csv(out_dir  + "kraken_FormattedTable.txt", sep='\t', index= False)
	subprocess.call("gzip " + out_dir  + "kraken_FormattedTable.txt", shell =True)
    subprocess.call("rm " + "kraken_table.txt kraken_labels.txt", shell = True)

	# Subset table for final results
	kraken_results = kraken_fullTable[["kraken_classified", "Seq_ID","Tax_ID", "Seq_tax", "Div_ID"]]

    if subset:
        # Make a list of "Seq_ID" column value if sequence is unclassified in "Classified" column or
        #  classified as VRL (virus) in column "Div_ID". This list will be used to determine which sequences
        #  will be further analysed by Kaiju
#        unclassified_IDs = kraken_results.loc[(kraken_results.kraken_classified == 'U'), ['Seq_ID']]
#        VRL_IDs = kraken_results.loc[(kraken_results.Div_ID == 'VRL'), ['Seq_ID']]
		kraken_VRL = kraken_results.loc[(kraken_results.Div_ID == 'VRL'),]
		kraken_VRL.to_csv(out_dir  + 'kraken_VRL.txt', sep='\t', index= False)
        reanalyse_IDs =  kraken_VRL['Seq_ID'].tolist()
#		reanalyse_IDs += unclassified_IDs['Seq_ID'].tolist()


        # Use biopython to make new fastq files of sequences to be reanalysed.
        def reanalyse_subset (input_file, output_file, id_list):
            outfile = open(out_dir  + 'reanalyse_ID.txt', 'w')
            print >> outfile, "\n".join(str(i) for i in id_list)
            outfile.close()
            id_file = out_dir  + "reanalyse_ID.txt"

            wanted = set(line.rstrip("\n").split(None,1)[0] for line in open(id_file))
            print "Found %i unique identifiers in %s" % (len(wanted), id_file)

            records = (r for r in SeqIO.parse(out_dir  + input_file, user_format) if r.id in wanted)
            count = SeqIO.write(records, out_dir  + output_file, user_format)
            print "Saved %i records from %s to %s" % (count, input_file, output_file)
            if count < len(wanted):
                print "Warning %i IDs not found in %s" % (len(wanted)-count, input_file)

        if renamed_file2:
            reanalyse_ID1 = [s + "/1" for s in reanalyse_IDs]
            reanalyse_ID2 = [s + "/2" for s in reanalyse_IDs]
            reanalyse_subset(renamed_file1, "subset_file1." + user_format, reanalyse_ID1)
            reanalyse_subset(renamed_file2, "subset_file2." + user_format, reanalyse_ID2)
            # Delete "renamed" files
            subprocess.call("rm renamed_1 renamed_2", shell=True)
        else:
            reanalyse_subset(renamed_file1 + user_format, "subset_file1." + user_format, reanalyse_IDs)


        # Deleted "reanalyse.txt"
        subprocess.call("rm reanalyse_ID.txt", shell=True)