SHARE
TWEET

Untitled

a guest Sep 14th, 2017 97 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. def seq_reanalysis(kraken_table, kraken_labels, out_dir, user_format, renamed_file1, subset, renamed_file2 = False):
  2.     kraken_colNames = ["kraken_classified", "Seq_ID","kraken_tax_ID", "kraken_length", "kraken_k-mer"]
  3.     kraken_fullTable = format_result_table(out_dir, "kraken_table.txt", "kraken_labels.txt", kraken_colNames)
  4.     # Save full kraken table, compress and delete unformatted kraken table
  5.     kraken_fullTable.to_csv(out_dir  + "kraken_FormattedTable.txt", sep='\t', index= False)
  6.     subprocess.call("gzip " + out_dir  + "kraken_FormattedTable.txt", shell =True)
  7.     subprocess.call("rm " + "kraken_table.txt kraken_labels.txt", shell = True)
  8.    
  9.     # Subset table for final results
  10.     kraken_results = kraken_fullTable[["kraken_classified", "Seq_ID","Tax_ID", "Seq_tax", "Div_ID"]]
  11.  
  12.     if subset:
  13.         # Make a list of "Seq_ID" column value if sequence is unclassified in "Classified" column or
  14.         #  classified as VRL (virus) in column "Div_ID". This list will be used to determine which sequences
  15.         #  will be further analysed by Kaiju
  16. #        unclassified_IDs = kraken_results.loc[(kraken_results.kraken_classified == 'U'), ['Seq_ID']]
  17. #        VRL_IDs = kraken_results.loc[(kraken_results.Div_ID == 'VRL'), ['Seq_ID']]
  18.         kraken_VRL = kraken_results.loc[(kraken_results.Div_ID == 'VRL'),]
  19.         kraken_VRL.to_csv(out_dir  + 'kraken_VRL.txt', sep='\t', index= False)
  20.         reanalyse_IDs =  kraken_VRL['Seq_ID'].tolist()
  21. #       reanalyse_IDs += unclassified_IDs['Seq_ID'].tolist()
  22.  
  23.  
  24.         # Use biopython to make new fastq files of sequences to be reanalysed.
  25.         def reanalyse_subset (input_file, output_file, id_list):
  26.             outfile = open(out_dir  + 'reanalyse_ID.txt', 'w')
  27.             print >> outfile, "\n".join(str(i) for i in id_list)
  28.             outfile.close()
  29.             id_file = out_dir  + "reanalyse_ID.txt"
  30.  
  31.             wanted = set(line.rstrip("\n").split(None,1)[0] for line in open(id_file))
  32.             print "Found %i unique identifiers in %s" % (len(wanted), id_file)
  33.  
  34.             records = (r for r in SeqIO.parse(out_dir  + input_file, user_format) if r.id in wanted)
  35.             count = SeqIO.write(records, out_dir  + output_file, user_format)
  36.             print "Saved %i records from %s to %s" % (count, input_file, output_file)
  37.             if count < len(wanted):
  38.                 print "Warning %i IDs not found in %s" % (len(wanted)-count, input_file)
  39.  
  40.         if renamed_file2:
  41.             reanalyse_ID1 = [s + "/1" for s in reanalyse_IDs]
  42.             reanalyse_ID2 = [s + "/2" for s in reanalyse_IDs]
  43.             reanalyse_subset(renamed_file1, "subset_file1." + user_format, reanalyse_ID1)
  44.             reanalyse_subset(renamed_file2, "subset_file2." + user_format, reanalyse_ID2)
  45.             # Delete "renamed" files
  46.             subprocess.call("rm renamed_1 renamed_2", shell=True)
  47.         else:
  48.             reanalyse_subset(renamed_file1 + user_format, "subset_file1." + user_format, reanalyse_IDs)
  49.  
  50.  
  51.         # Deleted "reanalyse.txt"
  52.         subprocess.call("rm reanalyse_ID.txt", shell=True)
RAW Paste Data
Pastebin PRO Autumn Special!
Get 40% OFF on Pastebin PRO accounts!
Top