Advertisement
Guest User

Untitled

a guest
Oct 26th, 2014
171
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.46 KB | None | 0 0
  1. text = 'GTGGTGCCCTTTTAGTGGTACATTTATGCTACACCTTACAGTGGTGCTTTATGCGTGGTTTATACAGTGGGTGGTGCGTGGTGCGTGGGTGGTACATTTAGTGGGTGGTGCTGCTACATACATGCTGCTACATTTAGTGGTTTATGCTACATGCTGCTTTATTTAGTGGGTGGCCTCCTCCTGTGGCCTTACATACATGCTTTATTTATACATGCTACATACATACATTTACCTCCTCCTGTGGTGCGTGGGTGGTACATGCTACAGTGGTACATACATTTACCTTACATGCTGCTACAGTGGTACATACATGCTGCTACATACATGCTACATTTATACA'
  2. d = 2
  3. k = 9
  4.  
  5. from itertools import product
  6.  
  7. def approx_frequent_words(text,k,d):
  8. #Generate candidates for all posibilities
  9. A=['A','T','G','C']
  10. B = product(A,repeat=k)
  11. pattern_max = 0
  12. temp_pattern2 = []
  13. count_list = []
  14. most_frequent = []
  15. formatted_string = ""
  16. conc_format_strings = ""
  17. #Check each candidate's pattern count against the max;
  18. #if >= max then add count to count_list, pattern to pattern_list
  19. for candidate in B:
  20. temp_pattern = approx_pattern_count(candidate,text,d)
  21. if temp_pattern[1] >= pattern_max:
  22. pattern_max = temp_pattern[1]
  23. temp_pattern2.append(temp_pattern[0])
  24. count_list.append(temp_pattern[1])
  25. pattern_max = max(count_list)
  26. for i in range(len(count_list)):
  27. if count_list[i] == pattern_max:
  28. most_frequent.append(temp_pattern2[i])
  29. for i in range(len(most_frequent)):
  30. string_to_format = most_frequent[i]
  31. for i in range(len(string_to_format)):
  32. formatted_string += str(string_to_format[i])
  33. conc_format_strings += formatted_string + " "
  34. formatted_string = ""
  35. return conc_format_strings.strip()
  36.  
  37.  
  38. def hamming_distance(string1,string2):
  39. hamming_distance = 0
  40. if len(string1)==len(string2):
  41. for i in range(len(string1)):
  42. if string1[i] != string2[i]:
  43. hamming_distance+=1
  44. else:
  45. print ("Strings are not same length!")
  46. return hamming_distance
  47.  
  48. def approx_pattern_match(pattern,text,d):
  49. output_string = ""
  50. for i in range(len(text)-len(pattern)+1):
  51. if hamming_distance(pattern,text[i:i+len(pattern)]) <= d:
  52. output_string += str(i) + " "
  53. return output_string.strip()
  54.  
  55. def approx_pattern_count(pattern,text,d):
  56. pattern_count = 0
  57. for i in range(len(text)-len(pattern)+1):
  58. if hamming_distance(pattern,text[i:i+len(pattern)]) <= d:
  59. pattern_count+=1
  60. return [pattern,pattern_count]
  61.  
  62. ##def approx_frequent_words(text,k,d):
  63. ## frequent_patterns = []
  64. ## pattern_candidates = []
  65. ##
  66. ## candidate_string = ""
  67. ## count_list = []
  68. ## #look through the entire text, store substrings of
  69. ## #length k in pattern_candidates, run pattern_count
  70. ## #to count repetitions of each substring
  71. ## #in text and store the count in count_list
  72. ## for i in range(len(text)-k+1):
  73. ## pattern_candidates.append(text[i:i+k])
  74. ## count_list.append(approx_pattern_count(pattern_candidates[i],text,d))
  75. ## #dummy list for pattern duplicate removal
  76. ## duplicate_remove = list(frequent_patterns)
  77. ## #delete duplicates from frequent_patterns
  78. ## for pattern in duplicate_remove:
  79. ## if frequent_patterns.count(pattern) > 1:
  80. ## frequent_patterns.remove(pattern)
  81. ## #alphabetical order
  82. ## frequent_patterns.sort()
  83. ## #format output to a string with spaces between patterns
  84. ## output_string = ""
  85. ## for pattern in frequent_patterns:
  86. ## output_string += pattern + " "
  87. ## return output_string.strip()
  88.  
  89. print (approx_frequent_words(text,k,d))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement