Guest User

Untitled

a guest
Jan 22nd, 2019
137
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.89 KB | None | 0 0
  1. import re
  2. def calculate_aminoacid_frequencies(fasta_filename, subsequences_filename, number_of_repetitions, output_filename):
  3. """Create a function that, given a multi-line protein FASTA file (fasta_filename) and a “sub-sequences” file
  4. (subsequences_filename) (one sequence in each line), calculates the proportion of proteins in the FASTA file
  5. containing at least N-times (number_of_repetitions) each of the sub-sequences (exactly equal). Save it in an
  6. output file with the specified format, ordered by the proportion value (descending order). Overlapping sub-
  7. sequences are taked into account in the count."""
  8. file= open(fasta_filename, "r")
  9. subfile= open(subsequences_filename, "r")
  10. output= open(output_filename, "w")
  11. numprots=0
  12. function={}
  13. sequence=""
  14. proteinscontaining=0
  15. for element in subfile:
  16. function[element.strip()] = 0
  17. numbersubseqs=len(function)
  18. subfile.close()
  19. for line in file:
  20. if line.startswith(">"):
  21. numprots+= 1
  22. if sequence != "":
  23. for subsequence in function:
  24. contador=0
  25. contador+= len(re.findall("(?="+subsequence+")", sequence))
  26. if contador >= number_of_repetitions:
  27. function[subsequence] += 1
  28. sequence=""
  29. else:
  30. sequence+= line.strip()
  31. file.close()
  32. subfile= open(subsequences_filename, "r")
  33. output.write("{:s}\t{:>20d}\n{:s}\t{:>20d}\n{:s}\n".format("#Number of proteins:", numprots, "#Number of subsequences:", numbersubseqs, "#Subsequence proportions:"))
  34. for subsequence in function:
  35. contador=0
  36. contador+= len(re.findall("(?="+subsequence+")", sequence))
  37. if contador >= number_of_repetitions:
  38. function[subsequence] += 1
  39. for item in sorted(function.items(), key=lambda item: item[1] , reverse=True):
  40. output.write("{:s}\t{:>10d}\t{:.4f}\n".format(item[0], item[1], item[1]/numprots))
  41. subfile.close()
  42. output.close()
  43.  
  44. #calculate_aminoacid_frequencies("example_fasta_file.fa", "sequence_fragments.txt", 5, "prueba.txt")
Add Comment
Please, Sign In to add comment