Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Spyder Editor
- This is a temporary script file.
- """
- from itertools import groupby
- import pandas as pd
- import os
- def fasta_iter(fasta_name):
- """
- modified from Brent Pedersen
- Correct Way To Parse A Fasta File In Python
- given a fasta file. yield tuples of header, sequence
- """
- "first open the file outside "
- fh = open(fasta_name)
- # ditch the boolean (x[0]) and just keep the header or sequence since
- # we know they alternate.
- faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
- for header in faiter:
- # drop the ">"
- headerStr = header.__next__()[1:].strip()
- # join all sequence lines to one.
- seq = "".join(s.strip() for s in faiter.__next__())
- yield (headerStr, seq)
- # put all your files into one directory. chdir to that directory
- os.chdir("/Users/beatrizpalao/Documents/dna_seq/sequences")
- #input files
- fa_file = "contigbcmerged.fa"
- fa_pp_results = "Abuabcontigs.fasta.results.csv"
- df_pp_results = pd.read_csv(fa_pp_results, sep='\t', header = 0, encoding = "ISO-8859-1")
- seq_id_pp = df_pp_results["ID"]
- faiter = fasta_iter(fa_file)
- seq_id_fa = []
- seq_fa = []
- for ff in faiter:
- headerStr, seq = ff
- seq_id_fa.append(headerStr)
- seq_fa.append(seq)
- seq_dict = dict(zip(seq_id_fa, seq_fa))
- #seq_dict['ITC1587_BchrUn_random_T39713_consensus']
- #some list comprehension here, seq_id = [x for x in seq_id_pp if x in seq_id_fa]
- seq_pp = []
- for x in seq_id_pp:
- if x in seq_id_fa:
- seq_pp.append(seq_dict[x])
- df_pp_results['Flank_Seq'] = seq_pp
- df_pp_results.to_csv("contigbcmerged_with_flankingseq.csv")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement