Advertisement
Guest User

Untitled

a guest
Sep 14th, 2017
158
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.04 KB | None | 0 0
  1. def seq_reanalysis(kraken_table, kraken_labels, out_dir, user_format, renamed_file1, subset, renamed_file2 = False):
  2. kraken_colNames = ["kraken_classified", "Seq_ID","kraken_tax_ID", "kraken_length", "kraken_k-mer"]
  3. kraken_fullTable = format_result_table(out_dir, "kraken_table.txt", "kraken_labels.txt", kraken_colNames)
  4. # Save full kraken table, compress and delete unformatted kraken table
  5. kraken_fullTable.to_csv(out_dir + "kraken_FormattedTable.txt", sep='\t', index= False)
  6. subprocess.call("gzip " + out_dir + "kraken_FormattedTable.txt", shell =True)
  7. subprocess.call("rm " + "kraken_table.txt kraken_labels.txt", shell = True)
  8.  
  9. # Subset table for final results
  10. kraken_results = kraken_fullTable[["kraken_classified", "Seq_ID","Tax_ID", "Seq_tax", "Div_ID"]]
  11.  
  12. if subset:
  13. # Make a list of "Seq_ID" column value if sequence is unclassified in "Classified" column or
  14. # classified as VRL (virus) in column "Div_ID". This list will be used to determine which sequences
  15. # will be further analysed by Kaiju
  16. # unclassified_IDs = kraken_results.loc[(kraken_results.kraken_classified == 'U'), ['Seq_ID']]
  17. # VRL_IDs = kraken_results.loc[(kraken_results.Div_ID == 'VRL'), ['Seq_ID']]
  18. kraken_VRL = kraken_results.loc[(kraken_results.Div_ID == 'VRL'),]
  19. kraken_VRL.to_csv(out_dir + 'kraken_VRL.txt', sep='\t', index= False)
  20. reanalyse_IDs = kraken_VRL['Seq_ID'].tolist()
  21. # reanalyse_IDs += unclassified_IDs['Seq_ID'].tolist()
  22.  
  23.  
  24. # Use biopython to make new fastq files of sequences to be reanalysed.
  25. def reanalyse_subset (input_file, output_file, id_list):
  26. outfile = open(out_dir + 'reanalyse_ID.txt', 'w')
  27. print >> outfile, "\n".join(str(i) for i in id_list)
  28. outfile.close()
  29. id_file = out_dir + "reanalyse_ID.txt"
  30.  
  31. wanted = set(line.rstrip("\n").split(None,1)[0] for line in open(id_file))
  32. print "Found %i unique identifiers in %s" % (len(wanted), id_file)
  33.  
  34. records = (r for r in SeqIO.parse(out_dir + input_file, user_format) if r.id in wanted)
  35. count = SeqIO.write(records, out_dir + output_file, user_format)
  36. print "Saved %i records from %s to %s" % (count, input_file, output_file)
  37. if count < len(wanted):
  38. print "Warning %i IDs not found in %s" % (len(wanted)-count, input_file)
  39.  
  40. if renamed_file2:
  41. reanalyse_ID1 = [s + "/1" for s in reanalyse_IDs]
  42. reanalyse_ID2 = [s + "/2" for s in reanalyse_IDs]
  43. reanalyse_subset(renamed_file1, "subset_file1." + user_format, reanalyse_ID1)
  44. reanalyse_subset(renamed_file2, "subset_file2." + user_format, reanalyse_ID2)
  45. # Delete "renamed" files
  46. subprocess.call("rm renamed_1 renamed_2", shell=True)
  47. else:
  48. reanalyse_subset(renamed_file1 + user_format, "subset_file1." + user_format, reanalyse_IDs)
  49.  
  50.  
  51. # Deleted "reanalyse.txt"
  52. subprocess.call("rm reanalyse_ID.txt", shell=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement