Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
def calculate_aminoacid_frequencies(fasta_filename, subsequences_filename, number_of_repetitions, output_filename):
    """Report the proportion of proteins that contain each sub-sequence.

    For every sub-sequence listed in ``subsequences_filename`` this counts
    how many proteins of the multi-line FASTA file ``fasta_filename``
    contain it at least ``number_of_repetitions`` times. Occurrences may
    overlap and are matched literally (exactly equal, no wildcards). The
    result is written to ``output_filename``, ordered by descending
    proportion; ties keep the order of the sub-sequences file.

    Args:
        fasta_filename: Path of the protein FASTA file. Each line starting
            with ``>`` opens a new protein; the following lines are joined
            into its sequence.
        subsequences_filename: Path of a text file with one sub-sequence per
            line. Blank lines are ignored and duplicates are counted once.
        number_of_repetitions: Minimum number of (possibly overlapping)
            occurrences a protein needs in order to count for a sub-sequence.
        output_filename: Path of the tab-separated report to create.

    Output format::

        #Number of proteins:<TAB><count, right-aligned to 20>
        #Number of subsequences:<TAB><count, right-aligned to 20>
        #Subsequence proportions:
        <subsequence><TAB><hits, right-aligned to 10><TAB><proportion, 4 decimals>
    """
    # A dict keeps file order and collapses duplicated sub-sequences.
    hits = {}
    with open(subsequences_filename, "r") as subsequences_file:
        for line in subsequences_file:
            subsequence = line.strip()
            # An empty pattern would "match" at every position of every
            # protein, so blank lines must not become sub-sequences.
            if subsequence:
                hits[subsequence] = 0

    # The zero-width lookahead makes findall report overlapping matches;
    # re.escape guarantees the sub-sequence is matched literally.
    patterns = {
        subsequence: re.compile("(?=" + re.escape(subsequence) + ")")
        for subsequence in hits
    }

    def tally(sequence):
        """Credit every sub-sequence repeated often enough in ``sequence``."""
        for subsequence, pattern in patterns.items():
            if len(pattern.findall(sequence)) >= number_of_repetitions:
                hits[subsequence] += 1

    number_of_proteins = 0
    chunks = []  # pieces of the protein sequence currently being read
    with open(fasta_filename, "r") as fasta_file:
        for line in fasta_file:
            if line.startswith(">"):
                number_of_proteins += 1
                # A new header closes the previous record, if it had residues.
                # NOTE: residue lines preceding the first header are scanned
                # as a sequence but are not counted as a protein.
                if chunks:
                    tally("".join(chunks))
                    chunks = []
            else:
                residues = line.strip()
                if residues:
                    chunks.append(residues)
    # The last record has no following header to trigger its evaluation.
    if chunks:
        tally("".join(chunks))

    # sorted() is stable, so equal counts keep the sub-sequence file order.
    ranking = sorted(hits.items(), key=lambda item: item[1], reverse=True)

    with open(output_filename, "w") as output:
        output.write("{:s}\t{:>20d}\n{:s}\t{:>20d}\n{:s}\n".format(
            "#Number of proteins:", number_of_proteins,
            "#Number of subsequences:", len(hits),
            "#Subsequence proportions:"))
        for subsequence, count in ranking:
            # An empty FASTA file has no proteins; report 0 instead of
            # failing with a division by zero.
            proportion = count / number_of_proteins if number_of_proteins else 0.0
            output.write("{:s}\t{:>10d}\t{:.4f}\n".format(subsequence, count, proportion))
- #calculate_aminoacid_frequencies("example_fasta_file.fa", "sequence_fragments.txt", 5, "prueba.txt")
Add Comment
Please, Sign In to add comment