Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Sample input for the script: the DNA text to scan, the maximum number of
# mismatches allowed per occurrence (d), and the pattern length (k).
text = 'GTGGTGCCCTTTTAGTGGTACATTTATGCTACACCTTACAGTGGTGCTTTATGCGTGGTTTATACAGTGGGTGGTGCGTGGTGCGTGGGTGGTACATTTAGTGGGTGGTGCTGCTACATACATGCTGCTACATTTAGTGGTTTATGCTACATGCTGCTTTATTTAGTGGGTGGCCTCCTCCTGTGGCCTTACATACATGCTTTATTTATACATGCTACATACATACATTTACCTCCTCCTGTGGTGCGTGGGTGGTACATGCTACAGTGGTACATACATTTACCTTACATGCTGCTACAGTGGTACATACATGCTGCTACATACATGCTACATTTATACA'
d = 2
k = 9
- from itertools import product
def approx_frequent_words(text, k, d):
    """Return the most frequent k-mers in ``text``, allowing mismatches.

    Every possible k-mer over the alphabet A, T, G, C is tried as a
    candidate (4**k candidates), and each one is scored with
    ``approx_pattern_count``.

    Args:
        text: The sequence to search.
        k: Length of the candidate patterns.
        d: Maximum number of mismatches allowed per occurrence.

    Returns:
        The candidates that reach the highest count, joined by single
        spaces, in the order the candidates are generated (alphabet order
        A, T, G, C). If no candidate occurs at all, every candidate ties
        at a count of zero and all of them are returned.
    """
    best_count = 0
    best_patterns = []
    # product() over a string yields tuples of single characters; the
    # alphabet order "ATGC" fixes the order of the reported patterns.
    for candidate in product("ATGC", repeat=k):
        count = approx_pattern_count(candidate, text, d)[1]
        if count > best_count:
            # A strictly better score invalidates everything kept so far.
            best_count = count
            best_patterns = [candidate]
        elif count == best_count:
            best_patterns.append(candidate)
    # Candidates are tuples of characters; glue each back into a string.
    return " ".join("".join(candidate) for candidate in best_patterns)
def hamming_distance(string1, string2):
    """Count the positions at which two equal-length sequences differ.

    Args:
        string1: First sequence; any sized iterable of characters works
            (for example a ``str`` or a tuple of single characters).
        string2: Second sequence, compared element by element with
            ``string1``.

    Returns:
        The number of mismatching positions. If the lengths differ, a
        warning is printed and 0 is returned, so a result of 0 on its own
        does not prove that the inputs are identical.
    """
    if len(string1) != len(string2):
        # Kept as a printed warning (not an exception) so existing callers
        # continue to receive 0 for mismatched lengths.
        print("Strings are not same length!")
        return 0
    return sum(1 for left, right in zip(string1, string2) if left != right)
def approx_pattern_match(pattern, text, d):
    """Find where ``pattern`` occurs in ``text`` with at most ``d`` mismatches.

    Args:
        pattern: The pattern to look for.
        text: The sequence to search.
        d: Maximum Hamming distance for a window to count as a match.

    Returns:
        The 0-based start positions of all matching windows, in ascending
        order, joined by single spaces. An empty string if nothing matches
        or if ``pattern`` is longer than ``text``.
    """
    width = len(pattern)
    # Slide a window of the pattern's length over every start position.
    positions = [
        str(start)
        for start in range(len(text) - width + 1)
        if hamming_distance(pattern, text[start:start + width]) <= d
    ]
    return " ".join(positions)
def approx_pattern_count(pattern, text, d):
    """Count occurrences of ``pattern`` in ``text`` with at most ``d`` mismatches.

    Args:
        pattern: The pattern to look for (a ``str`` or a tuple of
            characters, as passed by ``approx_frequent_words``).
        text: The sequence to search.
        d: Maximum Hamming distance for a window to count as an occurrence.

    Returns:
        A two-element list ``[pattern, count]``: the pattern exactly as
        given, followed by the number of windows of ``text`` within
        distance ``d`` of it. Overlapping windows are counted separately.
    """
    width = len(pattern)
    pattern_count = sum(
        1
        for start in range(len(text) - width + 1)
        if hamming_distance(pattern, text[start:start + width]) <= d
    )
    return [pattern, pattern_count]
## NOTE: earlier substring-based draft of approx_frequent_words, kept for
## reference only. It is disabled and incomplete: frequent_patterns is never
## filled from count_list, so it would always return an empty string.
##def approx_frequent_words(text,k,d):
##    frequent_patterns = []
##    pattern_candidates = []
##
##    candidate_string = ""
##    count_list = []
##    #look through the entire text, store substrings of
##    #length k in pattern_candidates, run pattern_count
##    #to count repetitions of each substring
##    #in text and store the count in count_list
##    for i in range(len(text)-k+1):
##        pattern_candidates.append(text[i:i+k])
##        count_list.append(approx_pattern_count(pattern_candidates[i],text,d))
##    #dummy list for pattern duplicate removal
##    duplicate_remove = list(frequent_patterns)
##    #delete duplicates from frequent_patterns
##    for pattern in duplicate_remove:
##        if frequent_patterns.count(pattern) > 1:
##            frequent_patterns.remove(pattern)
##    #alphabetical order
##    frequent_patterns.sort()
##    #format output to a string with spaces between patterns
##    output_string = ""
##    for pattern in frequent_patterns:
##        output_string += pattern + " "
##    return output_string.strip()
# Run the search on the sample input above and print the winning patterns.
print(approx_frequent_words(text, k, d))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement