Advertisement
cdocpalao

pySeqAppend

May 22nd, 2019
436
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.70 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Spyder Editor
  4.  
  5. This is a temporary script file.
  6. """
  7.  
  8. from itertools import groupby
  9. import pandas as pd
  10. import os
  11.  
  12. def fasta_iter(fasta_name):
  13. """
  14. modified from Brent Pedersen
  15. Correct Way To Parse A Fasta File In Python
  16. given a fasta file. yield tuples of header, sequence
  17. """
  18. "first open the file outside "
  19. fh = open(fasta_name)
  20.  
  21. # ditch the boolean (x[0]) and just keep the header or sequence since
  22. # we know they alternate.
  23. faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
  24.  
  25. for header in faiter:
  26. # drop the ">"
  27. headerStr = header.__next__()[1:].strip()
  28.  
  29. # join all sequence lines to one.
  30. seq = "".join(s.strip() for s in faiter.__next__())
  31.  
  32. yield (headerStr, seq)
  33.  
  34. # put all your files into one directory. chdir to that directory
  35. os.chdir("/Users/beatrizpalao/Documents/dna_seq/sequences")
  36.  
  37. #input files
  38. fa_file = "contigbcmerged.fa"
  39. fa_pp_results = "Abuabcontigs.fasta.results.csv"
  40.  
  41. df_pp_results = pd.read_csv(fa_pp_results, sep='\t', header = 0, encoding = "ISO-8859-1")
  42. seq_id_pp = df_pp_results["ID"]
  43.  
  44. faiter = fasta_iter(fa_file)
  45. seq_id_fa = []
  46. seq_fa = []
  47. for ff in faiter:
  48. headerStr, seq = ff
  49. seq_id_fa.append(headerStr)
  50. seq_fa.append(seq)
  51.  
  52. seq_dict = dict(zip(seq_id_fa, seq_fa))
  53. #seq_dict['ITC1587_BchrUn_random_T39713_consensus']
  54.  
  55. #some list comprehension here, seq_id = [x for x in seq_id_pp if x in seq_id_fa]
  56. seq_pp = []
  57. for x in seq_id_pp:
  58. if x in seq_id_fa:
  59. seq_pp.append(seq_dict[x])
  60.  
  61. df_pp_results['Flank_Seq'] = seq_pp
  62. df_pp_results.to_csv("contigbcmerged_with_flankingseq.csv")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement